In [20]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
import scipy.sparse as sp
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score,recall_score,confusion_matrix
from sklearn import svm
from sklearn.linear_model import LogisticRegression

In [3]:
train = pd.read_csv("./Dataset/Constraint_English_Train - Sheet1.csv")
val = pd.read_csv("./Dataset/Constraint_English_Val - Sheet1.csv")

In [4]:
lab = {'fake': 1,'real': 0}
train.label = [lab[item] for item in train.label]
val.label = [lab[item] for item in val.label]

In [5]:
X_train = np.array(train.tweet)
y_train = np.array(train.label)
X_test = np.array(val.tweet)
y_test = np.array(val.label)

In [6]:
count_vect = CountVectorizer(lowercase=False)
X_train_counts = count_vect.fit_transform(X_train)
X_test_counts = count_vect.transform(X_test)

tfidf_transformer= TfidfTransformer(use_idf=True, smooth_idf=True, sublinear_tf=False)

X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_test_tfidf = tfidf_transformer.fit_transform(X_test_counts)

rf = tree.DecisionTreeClassifier(max_depth=20)
rf.fit(X_train_tfidf,y_train)
y_pred = rf.predict(X_test_tfidf)
acc = accuracy_score(y_test,y_pred)
print("Decision Trees")
print ("Accuracy", float("{0:.2f}".format(acc*100)))

Decision Trees
Accuracy 87.48


In [7]:
neigh = KNeighborsClassifier(n_neighbors=3, weights="distance", algorithm="brute")
neigh.fit(X_train_tfidf,y_train)
y_pred = neigh.predict(X_test_tfidf)
acc = accuracy_score(y_test,y_pred)
print("k-NN")
print ("Accuracy", float("{0:.2f}".format(acc*100)))

k-NN
Accuracy 92.06


In [10]:
# Preprocessing - count vectorizer
count_vect = CountVectorizer(lowercase=False)
X_train_counts = count_vect.fit_transform(X_train)
X_test_counts = count_vect.transform(X_test)

In [27]:
svmclf = svm.SVC()
svmclf.fit(X_train_counts, y_train)
y_pred = svmclf.predict(X_test_counts)
acc = accuracy_score(y_test,y_pred)
print("SVM")
print ("Accuracy", float("{0:.2f}".format(acc*100)))

SVM
Accuracy 93.74


In [16]:
# Preprocessing - Tfid Vectorizer
tfidf_transformer= TfidfTransformer(use_idf=True, smooth_idf=True, sublinear_tf=False)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)

In [18]:
svmclf = svm.SVC()
svmclf.fit(X_train_tfidf, y_train)
y_pred = svmclf.predict(X_test_tfidf)
acc = accuracy_score(y_test,y_pred)
print("SVM")
print ("Accuracy", float("{0:.2f}".format(acc*100)))

SVM
Accuracy 94.25


In [33]:
lrclf = LogisticRegression(random_state=0)
lrclf.fit(X_train_counts, y_train)
y_pred = lrclf.predict(X_test_counts)
acc = accuracy_score(y_test,y_pred)
print("LR - CV")
print ("Accuracy", float("{0:.2f}".format(acc*100)))

LR - CV
Accuracy 93.6


In [30]:
lrclf = LogisticRegression(random_state=0)
lrclf.fit(X_train_tfidf, y_train)
y_pred = lrclf.predict(X_test_tfidf)
acc = accuracy_score(y_test,y_pred)
print("LR - tfidf ")
print ("Accuracy", float("{0:.2f}".format(acc*100)))

LR - tfidf 
Accuracy 92.9
