In [52]:
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.externals import joblib
import pandas as pd
import numpy as np

In [21]:
# Load data
# Fake news is positive. Real news is 0.
df_real = pd.read_csv("dataset/real_clean.csv")
df_real['label'] = 0

df_fake = pd.read_csv("dataset/fake_clean.csv")
df_fake['label'] = 1

# Concat the two datasets
df = pd.concat([df_real,df_fake])
df = df.reset_index(drop=True)
header = ["label","authors","date","title","text","domain"]
df = df[header]
print(df.shape)
df.to_csv("dataset/data_clean.csv",index=False,encoding='utf-8')

(18709, 6)


In [49]:
data = df.sample(frac=1).reset_index(drop=True)
data = data[:2000]

In [50]:
df_train, df_test = train_test_split(data, test_size=0.20)
print(df_train.shape)
print(df_test.shape)
sum(data['label'])

(1600, 6)
(400, 6)


1301

In [56]:
# for idx, row in data.iterrows():
#     if row["text"] is np.nan:
#         data.drop(idx, inplace=True)
#     elif isinstance(row["text"], str):
#         data.set_value(idx, "text", clean_content(row["text"]))


count_vect = CountVectorizer(stop_words='english')
X_train_counts = count_vect.fit_transform(df_train['text'])
print(X_train_counts.shape)

X_test_counts = count_vect.transform(df_test['text'])
print(X_test_counts.shape)

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
print(X_train_tfidf.shape)
print(X_test_tfidf.shape)



# add punctuation features
mnb_clf = MultinomialNB()
mnb_clf.fit(X_train_tfidf, df_train['label'])
ada_clf = AdaBoostClassifier(n_estimators=100).fit(X_train_tfidf, df_train['label'])
# text_clf = SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, n_iter=5, random_state=42)
# svm_clf = text_clf.fit(X_train_tfidf, df_train['label'])
svm_clf = LinearSVC().fit(X_train_tfidf, df_train['label'])

# joblib.dump(tfidf_transformer, 'tfidf_transformer.pkl')
# joblib.dump(count_vect, 'count_vect.pkl')
# joblib.dump(mnb_clf, 'mnb_clf.pkl')
# joblib.dump(svm_clf, 'svm_clf.pkl')
# joblib.dump(ada_clf, 'ada_clf.pkl')

(1600, 42196)
(400, 42196)
(1600, 42196)
(400, 42196)




In [57]:
predicted = mnb_clf.predict(X_test_tfidf)
ada_predictions = ada_clf.predict(X_test_tfidf)
svm_predictions = svm_clf.predict(X_test_tfidf)

ada_score = np.mean(ada_predictions == df_test['label'])
mnb_score = np.mean(predicted == df_test['label'])
svm_score = np.mean(svm_predictions == df_test['label'])

print("MNB: ",mnb_score)
print("ADA: ",ada_score)
print("SVM: ",svm_score)

sketchy_score = (mnb_score + ada_score + svm_score) / 3.0
# sketchy_score = (mnb_score)
print("Sketchy score: ",sketchy_score)

MNB:  0.78
ADA:  0.915
SVM:  0.8875
Sketchy score:  0.860833333333
