In [71]:
import pandas as pd
import re
import jieba
from sklearn.feature_extraction.text import CountVectorizer
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch
import os

# English Data Preprocessing

In [88]:
# Function to clean and tokenize English text
def clean_english_text(text):
    text = re.sub(r'http\S+', 'URL', text)  # Replace URLs with URL
    text = re.sub(r'\S+@\S+', 'EMAIL', text)  # Replace email addresses with EMAIL
    text = re.sub(r'\d{3,}', 'PHONENUMBER', text)      # Replace phone numbers with PHONE
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    text = text.lower()                  # Convert to lowercase
    return text

# Function to check for phishing indicators and clean text
def check_and_clean_text(text):
    has_url = bool(re.search(r'URL', text))
    has_email = bool(re.search(r'EMAIL', text))
    has_phone = bool(re.search(r'PHONE', text))
    cleaned_text = clean_english_text(text)
    return cleaned_text, has_url, has_email, has_phone

# Load Mendeley dataset
def load_mendeley_csv(file_path):
    df = pd.read_csv(file_path)
    df['LABEL'] = df['LABEL'].map({'ham': 0, 'spam': 1, 'Smishing': 2})
    df['CLEANED_TEXT'], df['HAS_URL'], df['HAS_EMAIL'], df['HAS_PHONE'] = zip(*df['TEXT'].map(check_and_clean_text))
    df['TOKENIZED_TEXT'] = df['CLEANED_TEXT'].apply(lambda x: x.split())
    return df

# Load SPAM SMS dataset
def load_spam_sms(file_path):
    with open(file_path, 'r', encoding='latin-1') as f:
        data = [line.strip().split('\t') for line in f.readlines()]
    df = pd.DataFrame(data, columns=['LABEL', 'TEXT'])
    df['LABEL'] = df['LABEL'].map({'ham': 0, 'spam': 1, 'Smishing': 2})
    df['CLEANED_TEXT'], df['HAS_URL'], df['HAS_EMAIL'], df['HAS_PHONE'] = zip(*df['TEXT'].map(check_and_clean_text))
    df['TOKENIZED_TEXT'] = df['CLEANED_TEXT'].apply(lambda x: x.split())
    return df

In [95]:
mendeley_df = load_mendeley_csv('mendeley/Dataset_5971.csv')
uci_df = load_spam_sms('uci/SMSSpamCollection')
english_df = pd.concat([mendeley_df,uci_df])
english_df


Unnamed: 0,LABEL,TEXT,URL,EMAIL,PHONE,CLEANED_TEXT,HAS_URL,HAS_EMAIL,HAS_PHONE,TOKENIZED_TEXT
0,0.0,Your opinion about me? 1. Over 2. Jada 3. Kusr...,No,No,No,your opinion about me 1 over 2 jada 3 kusruthi...,False,False,False,"[your, opinion, about, me, 1, over, 2, jada, 3..."
1,0.0,What's up? Do you want me to come online? If y...,No,No,No,whats up do you want me to come online if you ...,False,False,False,"[whats, up, do, you, want, me, to, come, onlin..."
2,0.0,So u workin overtime nigpun?,No,No,No,so u workin overtime nigpun,False,False,False,"[so, u, workin, overtime, nigpun]"
3,0.0,"Also sir, i sent you an email about how to log...",No,No,No,also sir i sent you an email about how to log ...,False,False,False,"[also, sir, i, sent, you, an, email, about, ho..."
4,2.0,Please Stay At Home. To encourage the notion o...,No,No,No,please stay at home to encourage the notion of...,False,False,False,"[please, stay, at, home, to, encourage, the, n..."
...,...,...,...,...,...,...,...,...,...,...
5569,1.0,This is the 2nd time we have tried 2 contact u...,,,,this is the 2nd time we have tried 2 contact u...,False,False,False,"[this, is, the, 2nd, time, we, have, tried, 2,..."
5570,0.0,Will Ã¼ b going to esplanade fr home?,,,,will ã¼ b going to esplanade fr home,False,False,False,"[will, ã¼, b, going, to, esplanade, fr, home]"
5571,0.0,"Pity, * was in mood for that. So...any other s...",,,,pity was in mood for that soany other suggest...,False,False,False,"[pity, was, in, mood, for, that, soany, other,..."
5572,0.0,The guy did some bitching but I acted like i'd...,,,,the guy did some bitching but i acted like id ...,False,False,False,"[the, guy, did, some, bitching, but, i, acted,..."


# Chinese Data Preprocessing

In [91]:
# Load stopwords
stopwords_path = 'ChineseTextClassification/data/hit_stopwords.txt'
with open(stopwords_path, 'r', encoding='utf-8') as f:
    stopwords = [line.strip() for line in f.readlines()]

# Function to clean and tokenize Chinese text
def clean_chinese_text(text):
    text = text.lower()                  # Convert to lowercase
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    text = re.sub(r'x', '', text)  # Remove special characters x
    return text
# Function to check for phishing indicators and clean text
def check_and_clean_chinese_text(text):
    has_url = bool(re.search(r'URL', text))
    has_email = bool(re.search(r'EMAIL', text))
    has_phone = bool(re.search(r'PHONE', text))
    cleaned_text = clean_chinese_text(text)
    return cleaned_text, has_url, has_email, has_phone
# Load Chinese Text Classification dataset
def load_chinese_text_classification(file_path):
    df = pd.read_csv(file_path, sep='\t', header=None, names=['LABEL', 'TEXT'])
    df['CLEANED_TEXT'], df['HAS_URL'], df['HAS_EMAIL'], df['HAS_PHONE'] = zip(*df['TEXT'].map(check_and_clean_chinese_text))
    df['TOKENIZED_TEXT'] = df['CLEANED_TEXT'].apply(lambda x: [word for word in jieba.cut(x) if word not in stopwords])

    return df

# Load FBS SMS Dataset with labels based on filenames
def load_fbs_sms(directory_path):
    data = []
    for file_name in os.listdir(directory_path):
        file_path = os.path.join(directory_path, file_name)
        label = file_name.split('.')[0]  # Assuming the label is the part of the filename before the extension
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f.readlines():
                line = line.strip()
                if line:
                    data.append([label, line])  # Use the label from the filename
    df = pd.DataFrame(data, columns=['LABEL', 'TEXT'])
    df['CLEANED_TEXT'], df['HAS_URL'], df['HAS_EMAIL'], df['HAS_PHONE'] = zip(*df['TEXT'].map(check_and_clean_chinese_text))
    df['TOKENIZED_TEXT'] = df['CLEANED_TEXT'].apply(lambda x: [word for word in jieba.cut(x) if word not in stopwords])
    return df


In [92]:
chinese_text_classification_df = load_chinese_text_classification('ChineseTextClassification/data/train.txt')
fbs_sms_df = load_fbs_sms('FBS_SMS_Dataset/data')
# Update labels for FBS SMS dataset
fbs_sms_df['LABEL'] = fbs_sms_df['LABEL'].apply(lambda x: 1 if x.startswith('AD') else (2 if x.startswith('FR') else x))
chinese_df = pd.concat([chinese_text_classification_df,fbs_sms_df])
chinese_df


Unnamed: 0,LABEL,TEXT,CLEANED_TEXT,HAS_URL,HAS_EMAIL,HAS_PHONE,TOKENIZED_TEXT
0,0,商业秘密的秘密性那是维系其商业价值和垄断地位的前提条件之一,商业秘密的秘密性那是维系其商业价值和垄断地位的前提条件之一,False,False,False,"[商业秘密, 秘密性, 维系, 商业价值, 垄断, 地位, 前提条件]"
1,1,南口阿玛施新春第一批限量春装到店啦   春暖花开淑女裙、冰蓝色公主衫 ...,南口阿玛施新春第一批限量春装到店啦 春暖花开淑女裙冰蓝色公主衫 气质粉小...,False,False,False,"[南口, 阿玛施, 新春, 第一批, 限量, 春装, 店, , , , , , ..."
2,0,带给我们大常州一场壮观的视觉盛宴,带给我们大常州一场壮观的视觉盛宴,False,False,False,"[带给, 大, 常州, 一场, 壮观, 视觉, 盛宴]"
3,0,有原因不明的泌尿系统结石等,有原因不明的泌尿系统结石等,False,False,False,"[原因, 不明, 泌尿系统, 结石]"
4,0,23年从盐城拉回来的麻麻的嫁妆,23年从盐城拉回来的麻麻的嫁妆,False,False,False,"[23, 年, 盐城, 拉回来, 麻麻, 嫁妆]"
...,...,...,...,...,...,...,...
11391,1,PLACE 移动 两周年 感恩 大 回馈 套餐 订购 有 礼 活动 惊喜 来 袭 本月 回复...,place 移动 两周年 感恩 大 回馈 套餐 订购 有 礼 活动 惊喜 来 袭 本月 回复...,False,False,False,"[place, , 移动, , 两周年, , 感恩, , 大, , 回馈, , ..."
11392,1,您好 请 回复 您 的 或 位 的 身份证 号码 信息 超级 流量 NAME 给 你 一次 ...,您好 请 回复 您 的 或 位 的 身份证 号码 信息 超级 流量 name 给 你 一次 ...,True,False,False,"[您好, , 请, , 回复, , , , , 位, , , 身份证, ,..."
11393,1,辞旧迎新 NAME 的 流量 嗨 翻天 百万 流量 红包 疯狂 抢 疯狂 派 开心 答题 N...,辞旧迎新 name 的 流量 嗨 翻天 百万 流量 红包 疯狂 抢 疯狂 派 开心 答题 n...,True,False,False,"[辞旧迎新, , name, , , 流量, , 嗨, , 翻天, , 百万, ..."
11394,1,游戏 NAME 尊贵 的 用户 您好 PLACE 联通 为 您 提供 新花 千骨 独家 礼包...,游戏 name 尊贵 的 用户 您好 place 联通 为 您 提供 新花 千骨 独家 礼包...,True,False,False,"[游戏, , name, , 尊贵, , , 用户, , 您好, , place..."


In [97]:
# Save each DataFrame to a CSV file
mendeley_df.to_csv('processed_data/mendeley_df.csv', index=False)
uci_df.to_csv('processed_data/uci_df.csv', index=False)
chinese_text_classification_df.to_csv('processed_data/chinese_text_classification_df.csv', index=False)
fbs_sms_df.to_csv('processed_data/fbs_sms_df.csv', index=False)

print("All DataFrames have been saved as CSV files.")

All DataFrames have been saved as CSV files.


## Top words


In [94]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Function to get top N words
def get_top_n_words(corpus, n=None):
    vec = CountVectorizer(token_pattern=r"(?u)\b\w+\b").fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]

# Function to print top words
def print_top_words(words_freq, title):
    print(f"\n{title}")
    print(f"{'Word':<20} {'Frequency'}")
    print("="*30)
    for word, freq in words_freq:
        print(f"{word:<20} {freq}")

# Analyze and print top words for each label in the dataframe
def analyze_dataframe_by_label(df, title, labels):
    for label in labels:
        filtered_df = df[df['LABEL'] == label]
        words_freq = get_top_n_words(filtered_df['CLEANED_TEXT'], 20)
        print_top_words(words_freq, f"{title} (Label {label})")



# Analyze Mendeley dataset
analyze_dataframe_by_label(mendeley_df, 'Top 20 Words in Mendeley Dataset', [1, 2])

# Analyze UCI SMS dataset
analyze_dataframe_by_label(uci_df, 'Top 20 Words in UCI SMS Dataset', [1])

# Analyze Chinese Text Classification dataset
analyze_dataframe_by_label(chinese_text_classification_df, 'Top 20 Words in Chinese Text Classification Dataset', [1])

# Analyze FBS SMS dataset
analyze_dataframe_by_label(fbs_sms_df, 'Top 20 Words in FBS SMS Dataset', [1, 2])



Top 20 Words in Mendeley Dataset (Label 1)
Word                 Frequency
phonenumber          513
to                   445
a                    160
the                  157
free                 151
for                  149
your                 137
now                  126
txt                  119
call                 115
2                    109
and                  103
you                  101
on                   94
ur                   93
or                   91
stop                 90
text                 81
get                  80
4                    79

Top 20 Words in Mendeley Dataset (Label 2)
Word                 Frequency
phonenumber          997
to                   522
call                 386
you                  363
a                    355
your                 322
have                 173
claim                166
for                  153
prize                138
is                   134
or                   132
customer             130
now                  118
free   

In [55]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# Function to get top N words
def get_top_n_words(corpus, n=None):
    vec = CountVectorizer(token_pattern=r"(?u)\b\w+\b").fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]

# Function to print top words
def print_top_words(words_freq, title):
    print(f"{title}")
    print(f"{'Word':<20} {'Frequency'}")
    print("="*30)
    for word, freq in words_freq:
        print(f"{word:<20} {freq}")

# Analyze and print top words for each dataframe
def analyze_dataframe(df, title):
    words_freq = get_top_n_words(df['CLEANED_TEXT'], 20)
    print_top_words(words_freq, title)



# Analyze Mendeley dataset
analyze_dataframe(mendeley_df, 'Top 20 Words in Mendeley Dataset')

# Analyze SPAM SMS dataset
analyze_dataframe(uci_df, 'Top 20 Words in SPAM SMS Dataset')

# Analyze Chinese Text Classification dataset
analyze_dataframe(chinese_text_classification_df, 'Top 20 Words in Chinese Text Classification Dataset')

# Analyze FBS SMS dataset
analyze_dataframe(fbs_sms_df, 'Top 20 Words in FBS SMS Dataset')



Top 20 Words in Mendeley Dataset
Word                 Frequency
example              1
text                 1
for                  1
mendeley             1
dataset              1
Top 20 Words in SPAM SMS Dataset
Word                 Frequency
phone                2350
to                   2251
i                    2239
you                  2128
a                    1442
the                  1333
u                    1132
and                  971
is                   893
in                   888
me                   791
my                   757
for                  710
your                 677
it                   622
of                   620
call                 578
have                 576
on                   536
that                 514
Top 20 Words in Chinese Text Classification Dataset
Word                 Frequency
0                    29376
1                    3269
xxxx                 1451
x                    782
xxxxxxxxxxx          665
xxx                  401
xx           