### fasttext训练cooking.stackexchange.txt

In [None]:
import fasttext

# Train a supervised fastText classifier on the cooking.stackexchange corpus,
# then write the trained model to disk.
model = fasttext.train_supervised(
    input="./data/cooking.stackexchange.txt",
    epoch=25,
    dim=200,
)
model.save_model("./model/cs.bin")


In [4]:
# Text classification: reload the saved cooking model and print the predicted
# label (with its probability) for a few sample questions.
model = fasttext.load_model("./model/cs.bin")
sample_questions = (
    "Which baking dish is best to bake a banana bread ?",
    "Why not put knives in the dishwasher?",
    "What is the difference between white and brown eggs?",
)
for question in sample_questions:
    print(model.predict(question))


(('__label__baking',), array([0.6734696]))
(('__label__equipment',), array([0.70369226]))
(('__label__eggs',), array([0.57913929]))


### Fake News Classification

#### 数据预处理

In [1]:
# 读取csv文件
import pandas as pd 
# Load the raw WELFake dataset and preview its first ten rows.
raw_csv_path = "./data/WELFake_Dataset.csv"
df = pd.read_csv(raw_csv_path)
df.head(n=10)

Unnamed: 0.1,Unnamed: 0,title,text,label
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,1
1,1,,Did they post their votes for Hillary already?,1
2,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",1
3,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,0
4,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",1
5,5,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,1
6,6,DR BEN CARSON TARGETED BY THE IRS: “I never ha...,DR. BEN CARSON TELLS THE STORY OF WHAT HAPPENE...,1
7,7,HOUSE INTEL CHAIR On Trump-Russia Fake Story: ...,,1
8,8,Sports Bar Owner Bans NFL Games…Will Show Only...,"The owner of the Ringling Bar, located south o...",1
9,9,Latest Pipeline Leak Underscores Dangers Of Da...,"FILE – In this Sept. 15, 2005 file photo, the ...",1


In [2]:
# Number of missing values in each column.
df.isnull().sum()

Unnamed: 0      0
title         558
text           39
label           0
dtype: int64

In [3]:
# Merge title and text into a single field separated by one space.
# NaN propagates through the concatenation, so a row that is missing either
# its title or its text ends up with a NaN title_text.
df['title_text'] = df['title'].add(' ').add(df['text'])

In [4]:
# Number of rows per class, followed by the overall (rows, columns) shape.
label_counts = df['label'].value_counts()
print(label_counts)
print(df.shape)



label
1    37106
0    35028
Name: count, dtype: int64
(72134, 5)


In [5]:
# Keep only the rows whose title_text is present (i.e. drop the NaN rows).
has_title_text = df['title_text'].notna()
df = df[has_title_text]

In [6]:
# Re-check the class balance and the total size after dropping rows.
for summary in (df['label'].value_counts(), df.shape):
    print(summary)

label
1    36509
0    35028
Name: count, dtype: int64
(71537, 5)


In [7]:
# Swap the class labels: 0 becomes 1 and every other value becomes 0.
# With 0/1 labels, re-running this cell swaps them back, so run it only once.
df['label'] = df['label'].eq(0).astype('int64')

In [None]:
# 对title_text进行预处理
import re
import string
import nltk
import swifter
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download every NLTK resource this cell uses before reading any of them.
# The stop-word corpus was previously read without ever being downloaded,
# which raises LookupError on a machine that does not already have it.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# English stop words, kept in a set for membership tests in clean_text.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
# Matches runs of digits; compiled once and reused for every document.
pattern_digits = re.compile(r'\d+')

def clean_text(text):
    """Normalise one document into a space-joined string of lemmas.

    Steps, in order: remove every character that is neither a word character
    nor whitespace, remove digit runs, lower-case, strip surrounding
    whitespace, tokenize, drop stop words, lemmatize.

    Args:
        text: the raw document as a string.

    Returns:
        The remaining lemmatized tokens joined by single spaces.
    """
    without_punct = re.sub(r'[^\w\s]', '', text)
    without_digits = pattern_digits.sub('', without_punct)
    normalised = without_digits.lower().strip()

    lemmas = []
    for word in word_tokenize(normalised):
        if word in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(word))
    return ' '.join(lemmas)

# Clean every document through swifter's apply and overwrite the column with
# the cleaned text.
cleaned_title_text = df['title_text'].swifter.apply(clean_text)
df['title_text'] = cleaned_title_text



[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sgp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sgp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Pandas Apply: 100%|██████████| 71537/71537 [03:04<00:00, 388.01it/s]


In [None]:
# 对title_text进行预处理
import re
import string
import nltk
import swifter
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import unicodedata

nltk.download('punkt')
nltk.download('wordnet')

# Load the custom stop-word list, one entry per line.
# The encoding is pinned to UTF-8 so the file is decoded the same way on
# every platform instead of with the locale-dependent default.
with open('./data/en_stopwords.txt', 'r', encoding='utf-8') as f:
    stop_words = {line.strip() for line in f}

lemmatizer = WordNetLemmatizer()
# Matches runs of digits; compiled once and reused for every document.
pattern_digits = re.compile(r'\d+')

def remove_unicode_punctuation(text):
    """Return ``text`` without any character in a Unicode punctuation category.

    A character is dropped when its ``unicodedata.category`` starts with
    ``'P'``. Characters in other categories, such as symbols (``S*``), are
    kept. Nothing is inserted in place of a removed character.
    """
    kept = []
    for char in text:
        if unicodedata.category(char)[0] == 'P':
            continue
        kept.append(char)
    return ''.join(kept)

def clean_text(text):
    """Normalise one document into a space-joined string of lemmas.

    Steps, in order: remove Unicode punctuation, remove digit runs,
    lower-case, strip surrounding whitespace, tokenize, drop the custom stop
    words, lemmatize.

    Note: defining this function replaces any ``clean_text`` already present
    in the kernel, so later cells use whichever definition ran last.

    Args:
        text: the raw document as a string.

    Returns:
        The remaining lemmatized tokens joined by single spaces.
    """
    stripped = remove_unicode_punctuation(text)
    stripped = pattern_digits.sub('', stripped)
    words = word_tokenize(stripped.lower().strip())
    return ' '.join(
        lemmatizer.lemmatize(word) for word in words if word not in stop_words
    )

# Run clean_text over every document through swifter's apply, then store the
# result back in the same column.
title_text_cleaned = df['title_text'].swifter.apply(clean_text)
df['title_text'] = title_text_cleaned

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sgp\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sgp\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
Pandas Apply: 100%|██████████| 71537/71537 [03:19<00:00, 358.29it/s]


In [45]:
# Save the processed data; index=False leaves the row index out of the file.
clean_csv_path = "./data/WELFake_Dataset_clean.csv"
df.to_csv(clean_csv_path, index=False)

In [46]:
# Reload the cleaned dataset from disk and preview its first five rows.
df = pd.read_csv("./data/WELFake_Dataset_clean.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,title,text,label,title_text
0,0,LAW ENFORCEMENT ON HIGH ALERT Following Threat...,No comment is expected from Barack Obama Membe...,0,law enforcement alert threat cop white blackli...
1,2,UNBELIEVABLE! OBAMA’S ATTORNEY GENERAL SAYS MO...,"Now, most of the demonstrators gathered last ...",0,unbelievable obama attorney charlotte rioter p...
2,3,"Bobby Jindal, raised Hindu, uses story of Chri...",A dozen politically active pastors came here f...,1,bobby jindal raised hindu story christian conv...
3,4,SATAN 2: Russia unvelis an image of its terrif...,"The RS-28 Sarmat missile, dubbed Satan 2, will...",0,satan russia unvelis image terrifying supernuk...
4,5,About Time! Christian Group Sues Amazon and SP...,All we can say on this one is it s about time ...,0,time christian sue amazon splc designation hat...


#### fasttext词向量训练

In [47]:
# Export the data in fastText's supervised format, one example per line:
#     __label__<label> <text>
# - The file is written as UTF-8 explicitly: fastText expects UTF-8 input,
#   while the platform default encoding can be something else.
# - Rows whose title_text is missing are skipped; otherwise the f-string would
#   write the literal text "nan" as the document. (A document that is empty
#   after cleaning is read back from CSV as NaN.)
# - The two needed columns are zipped instead of calling iterrows(), which
#   builds a Series for every row.
fasttext_rows = df.dropna(subset=['title_text'])
with open("./data/WELFake_Dataset_clean_fasttext.txt", "w", encoding="utf-8") as f:
    for label, text in zip(fasttext_rows['label'], fasttext_rows['title_text']):
        f.write(f"__label__{label} {text}\n")


In [48]:
import fasttext

In [50]:

# Train the fake-news fastText classifier on the exported file and save it.
training_options = {
    "input": "./data/WELFake_Dataset_clean_fasttext.txt",
    "epoch": 50,
    "dim": 100,
    "lr": 0.3,
    "loss": 'hs',
    "minCount": 2,
    "wordNgrams": 2,
}
model = fasttext.train_supervised(**training_options)
model.save_model("./model/wel.bin")


In [51]:
# Reload the trained fake-news model from disk.
wel_model_path = "./model/wel.bin"
model = fasttext.load_model(wel_model_path)

In [76]:
# Predict the label of a news article: read it, run it through clean_text,
# then print the fastText model's prediction.
# NOTE(review): no encoding is passed to open(), so test.txt is decoded with
# the platform default; confirm that matches how the file was saved.
with open("./data/test.txt", "r") as news_file:
    raw_article = news_file.read()
text = clean_text(raw_article)
prediction = model.predict(text)
print(prediction)

(('__label__0',), array([0.99999928]))


#### TF-IDF 词向量训练

In [56]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import joblib
import pandas as pd


In [57]:

# Load the cleaned dataset.
# A document that is empty after cleaning is written to CSV as an empty field
# and read back as NaN, which TfidfVectorizer rejects, so drop those rows.
df = pd.read_csv("./data/WELFake_Dataset_clean.csv").dropna(subset=['title_text'])

# Hold out 30% of the rows for evaluation. random_state is fixed so the split,
# and therefore the reported score, is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    df['title_text'], df['label'], test_size=0.3, random_state=42
)

# TF-IDF: fit the vocabulary and IDF weights on the training split only, then
# apply the fitted vectorizer to the test split.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)



In [65]:

# Logistic regression on the TF-IDF features; print accuracy on the test split.
clf = LogisticRegression()
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))

# Save the fitted vectorizer as well as the classifier: the classifier only
# accepts features produced by this exact vectorizer, so without it the saved
# model cannot be used after a kernel restart.
joblib.dump(vectorizer, "./model/tfidf.pkl")

# Save the model (kept last so the cell still displays this path).
joblib.dump(clf, "./model/lr.pkl")



0.9416643369676638


['./model/lr.pkl']

In [66]:
# Load the saved classifier. joblib.load unpickles the file, so only load
# model files you created yourself or otherwise trust.
# The TF-IDF vectorizer is not restored here; the prediction cell relies on
# the `vectorizer` object that is still in the kernel.
lr_model_path = "./model/lr.pkl"
clf = joblib.load(lr_model_path)

In [77]:
# Predict: read the article, clean it with clean_text, turn it into TF-IDF
# features with the fitted vectorizer, then print the classifier's label.
# NOTE(review): no encoding is passed to open(), so test.txt is decoded with
# the platform default; confirm that matches how the file was saved.
with open("./data/test.txt", "r") as news_file:
    raw_article = news_file.read()
cleaned_article = clean_text(raw_article)
text = vectorizer.transform([cleaned_article])
predicted_label = clf.predict(text)
print(predicted_label)


[0]
