In [1]:
import pandas as pd
import numpy as np
import string
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import WordNetLemmatizer 
from nltk.stem import SnowballStemmer
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.model_selection import KFold


In [2]:

# Compiled once: matches any single ASCII punctuation character.
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))
# Lazily filled cache so the NLTK resources are built once, not once per row.
_PREPROCESS_RESOURCES = {}


def _get_preprocess_resources():
    """Return the cached ``(stop_words, lemmatizer)`` pair, building it on first use."""
    if not _PREPROCESS_RESOURCES:
        # A frozenset gives O(1) membership tests instead of scanning a list per token.
        _PREPROCESS_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESS_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESS_RESOURCES['stop_words'], _PREPROCESS_RESOURCES['lemmatizer']


def preprocess(row):
    """Normalise the ``'text'`` field of one record for bag-of-words vectorisation.

    Steps: lower-case, replace punctuation with spaces, keep only purely
    alphabetic tokens, drop English stop words, lemmatize each remaining token,
    and re-join the tokens with single spaces.

    Parameters
    ----------
    row : mapping-like (e.g. a ``pandas.Series`` row or a ``dict``)
        Must support ``row['text']`` for reading and writing.

    Returns
    -------
    The same ``row`` object, with ``row['text']`` replaced by the cleaned string.
    A missing value (``None`` / NaN) is treated as an empty string.
    """
    text = row['text']
    if not isinstance(text, str):
        # A missing body would otherwise crash on .lower().
        text = '' if pd.isna(text) else str(text)

    text = _PUNCTUATION_RE.sub(' ', text.lower())

    stop_words, lemmatizer = _get_preprocess_resources()

    # split() with no argument splits on any whitespace run. The previous
    # split(' ') left words joined by newlines/tabs as one token, which the
    # isalpha() filter then threw away.
    tokens = [
        lemmatizer.lemmatize(word)
        for word in text.split()
        if word.isalpha() and word not in stop_words
    ]

    row['text'] = " ".join(tokens)
    return row
    


In [3]:
 
# Load the labelled articles; rows whose body text is missing fall back to their title.
path = 'C:/Users/shash/Spring Projects/Predictive/Train.csv'
licensing_df = pd.read_csv(path)
text_is_missing = licensing_df['text'].isna()
title = licensing_df.loc[text_is_missing, 'title']
print("Number of missing data", len(title))
licensing_df.loc[text_is_missing, 'text'] = title

# Keep only the article body and its label, then clean every body with preprocess().
columns = ['text', 'type']
data = licensing_df[columns].apply(preprocess, axis=1)
y = data['type']

# Bag-of-words counts over the cleaned text (TfidfVectorizer is the alternative).
# NOTE(review): the vocabulary is fitted on all rows before any train/test split.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(data['text'])

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)



Number of missing data 534


In [4]:
# Show the (rows, columns) dimensions of the loaded frame.
licensing_df.shape

(38000, 4)

In [5]:
# Preview the first ten rows of the loaded frame.
licensing_df.head(10)

Unnamed: 0,date,title,text,type
0,2016-03-11,Governor extends Flint water emergency as stat...,WASHINGTON (Reuters) - Michigan Governor Rick ...,real
1,2017-10-09,"DEMOCRATS CONVENIENTLY FORGET 6,000 Prisoners ...",Democrats are calling for President Trump s sc...,fake
2,2018-02-01,Mexico recognizes Honduran president as winner...,MEXICO CITY (Reuters) - Mexico recognized Hond...,real
3,2016-11-20,BOOM! Wikileaks Shows Hillary Speech To Banker...,No wonder she didn t want anyone to see her sp...,fake
4,2017-07-06,Paul Ryan says confident tax reform will pass ...,WASHINGTON (Reuters) - Republicans will be abl...,real
5,2018-02-12,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,real
6,2017-04-26,"Boeing, aerospace manufacturers back U.S. tax ...",SEATTLE (Reuters) - Boeing Co and about 90 oth...,real
7,2016-07-28,Saudi Prince Reminds Donald Trump: I Bailed Yo...,While Donald Trump continues to paint himself ...,fake
8,2016-06-21,ADMIRAL JAMES “ACE” LYONS WARNS: What The Join...,Admiral Ace Lyons has warned us all before in ...,fake
9,2017-11-18,Syrian army nears Islamic State stronghold al-...,BEIRUT (Reuters) - Syria s army and its allies...,real


In [10]:
# 10-fold cross-validation of a random forest on the bag-of-words features.
# shuffle / random_state are keyword-only in current scikit-learn, so the old
# positional form KFold(10, True, 1) raises a TypeError there.
kfold = KFold(n_splits=10, shuffle=True, random_state=1)
# Seeded so that re-running the cell reproduces the same scores.
model = RandomForestClassifier(random_state=1)
# cross_val_predict fits a fresh clone of `model` on each training fold and
# returns one out-of-fold prediction per row, in the row order of X / y.
# It selects rows by position, so it does not depend on y's index labels
# matching row positions the way the manual y[train] / y_test.index loop did,
# and it no longer overwrites the hold-out X_train / X_test / y_train / y_test.
# `model` itself stays unfitted; only the per-fold clones are trained.
predictions = list(cross_val_predict(model, X, y, cv=kfold))




In [11]:
# Summarise the out-of-fold predictions: per-class metrics, then accuracy in percent.
clf_report = classification_report(y, predictions)
n_correct = int((y == predictions).sum())
print(clf_report)
print('Accuracy :', n_correct * 100 / len(predictions))

              precision    recall  f1-score   support

        fake       0.95      0.98      0.97     19865
        real       0.98      0.95      0.96     18135

    accuracy                           0.97     38000
   macro avg       0.97      0.97      0.97     38000
weighted avg       0.97      0.97      0.97     38000

Accuracy : 96.67105263157895


In [8]:
# 10-fold cross-validation of a single decision tree on the bag-of-words features.
# shuffle / random_state are keyword-only in current scikit-learn, so the old
# positional form KFold(10, True, 1) raises a TypeError there.
kfold = KFold(n_splits=10, shuffle=True, random_state=1)
# Seeded so that re-running the cell reproduces the same scores.
model = DecisionTreeClassifier(random_state=1)
# cross_val_predict fits a fresh clone of `model` on each training fold and
# returns one out-of-fold prediction per row, in the row order of X / y.
# It selects rows by position, so it does not depend on y's index labels
# matching row positions the way the manual y[train] / y_test.index loop did,
# and it no longer overwrites the hold-out X_train / X_test / y_train / y_test.
# `model` itself stays unfitted; only the per-fold clones are trained.
predictions = list(cross_val_predict(model, X, y, cv=kfold))


In [9]:
# Summarise the out-of-fold predictions: per-class metrics, then accuracy in percent.
clf_report = classification_report(y, predictions)
n_correct = int((y == predictions).sum())
print(clf_report)
print('Accuracy :', n_correct * 100 / len(predictions))

              precision    recall  f1-score   support

        fake       1.00      1.00      1.00     19865
        real       1.00      1.00      1.00     18135

    accuracy                           1.00     38000
   macro avg       1.00      1.00      1.00     38000
weighted avg       1.00      1.00      1.00     38000

Accuracy : 99.67105263157895


In [6]:
# 10-fold cross-validation of logistic regression on the bag-of-words features.
# shuffle / random_state are keyword-only in current scikit-learn, so the old
# positional form KFold(10, True, 1) raises a TypeError there.
kfold = KFold(n_splits=10, shuffle=True, random_state=1)
model = LogisticRegression(random_state=0)
# cross_val_predict fits a fresh clone of `model` on each training fold and
# returns one out-of-fold prediction per row, in the row order of X / y.
# It selects rows by position, so it does not depend on y's index labels
# matching row positions the way the manual y[train] / y_test.index loop did,
# and it no longer overwrites the hold-out X_train / X_test / y_train / y_test.
# `model` itself stays unfitted; only the per-fold clones are trained.
predictions = list(cross_val_predict(model, X, y, cv=kfold))




In [7]:
# Summarise the out-of-fold predictions: per-class metrics, then accuracy in percent.
clf_report = classification_report(y, predictions)
n_correct = int((y == predictions).sum())
print(clf_report)
print('Accuracy :', n_correct * 100 / len(predictions))

              precision    recall  f1-score   support

        fake       1.00      1.00      1.00     19865
        real       1.00      1.00      1.00     18135

    accuracy                           1.00     38000
   macro avg       1.00      1.00      1.00     38000
weighted avg       1.00      1.00      1.00     38000

Accuracy : 99.62105263157895


In [12]:
# 10-fold cross-validation of gradient-boosted trees on the bag-of-words features.
# shuffle / random_state are keyword-only in current scikit-learn, so the old
# positional form KFold(10, True, 1) raises a TypeError there.
kfold = KFold(n_splits=10, shuffle=True, random_state=1)
# NOTE(review): recent xgboost releases may expect integer-encoded class labels;
# confirm the installed version accepts y's string labels as-is.
model = XGBClassifier()
# cross_val_predict fits a fresh clone of `model` on each training fold and
# returns one out-of-fold prediction per row, in the row order of X / y.
# It selects rows by position, so it does not depend on y's index labels
# matching row positions the way the manual y[train] / y_test.index loop did,
# and it no longer overwrites the hold-out X_train / X_test / y_train / y_test.
# `model` itself stays unfitted; only the per-fold clones are trained.
predictions = list(cross_val_predict(model, X, y, cv=kfold))


In [13]:
# Summarise the out-of-fold predictions: per-class metrics, then accuracy in percent.
clf_report = classification_report(y, predictions)
n_correct = int((y == predictions).sum())
print(clf_report)
print('Accuracy :', n_correct * 100 / len(predictions))

              precision    recall  f1-score   support

        fake       1.00      0.99      1.00     19865
        real       0.99      1.00      1.00     18135

    accuracy                           1.00     38000
   macro avg       1.00      1.00      1.00     38000
weighted avg       1.00      1.00      1.00     38000

Accuracy : 99.63421052631578


In [None]:
# 10-fold cross-validation of a linear-kernel SVM on the bag-of-words features.
# shuffle / random_state are keyword-only in current scikit-learn, so the old
# positional form KFold(10, True, 1) raises a TypeError there.
kfold = KFold(n_splits=10, shuffle=True, random_state=1)
model = SVC(kernel='linear')
# cross_val_predict fits a fresh clone of `model` on each training fold and
# returns one out-of-fold prediction per row, in the row order of X / y.
# It selects rows by position, so it does not depend on y's index labels
# matching row positions the way the manual y[train] / y_test.index loop did,
# and it no longer overwrites the hold-out X_train / X_test / y_train / y_test.
# `model` itself stays unfitted; only the per-fold clones are trained.
predictions = list(cross_val_predict(model, X, y, cv=kfold))


In [None]:
# Summarise the out-of-fold predictions: per-class metrics, then accuracy in percent.
clf_report = classification_report(y, predictions)
n_correct = int((y == predictions).sum())
print(clf_report)
print('Accuracy :', n_correct * 100 / len(predictions))