In [1]:
import glob
import json
import pandas as pd
import os
import gzip
import re
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import stopwords

import numpy as np
import pandas as pd
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
#Calculate accuracy
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

  from numpy.core.umath_tests import inner1d


In [2]:
def read_data(directory):
    dfs = []
    for label in ['real', 'fake']:
        for file in glob.glob(directory + os.path.sep + label + os.path.sep + '*gz'):
            print('reading %s' % file)
            df = pd.DataFrame((json.loads(line) for line in gzip.open(file)))
            df['label'] = label
            dfs.append(df)
    df=pd.concat(dfs)[['publish_date', 'source', 'text', 'title', 'tweets', 'label']]
    list_text = [i for i in list(df.text) if i != '']
    return df[df.text.isin(list_text)]

In [3]:
directory = r'C:\Users\lenovo\Desktop\IIT\training_data_2'
df = read_data(directory)

reading C:\Users\lenovo\Desktop\IIT\training_data_2\real\real.json.gz
reading C:\Users\lenovo\Desktop\IIT\training_data_2\fake\fake.json.gz


In [4]:
def get_text(list):
    stopword=set(stopwords.words('english'))
    list_new=[]
    for l in list:
        l=re.sub(r"[^\w']",' ',l).lower()
        l1=[tokennizer(w) for w in l.split() if len(tokennizer(w))>2]
        l=' '.join(l1)
        l1=[tokennizer(w) for w in l.split() if len(tokennizer(w))>2 and tokennizer(w) not in stopword]
        l=' '.join(lemmatize(l1))
        list_new.append(l)
    return list_new

def tokennizer(s):
    s = re.sub(r'http\S+', '', s)
    s = re.sub(r'[0-9_\s]+', '', s)
    s = re.sub(r"[^'\w]+", '', s)

    s = re.compile(r"(?<=[a-zA-Z])'re").sub(' are', s)
    s = re.compile(r"(?<=[a-zA-Z])'m").sub(' am', s)
    s = re.compile(r"(?<=[a-zA-Z])'ve").sub(' have', s)
    s = re.compile(r"(it|he|she|that|this|there|here|what|where|when|who|why|which)('s)").sub(r"\1 is", s)
    s = re.sub(r"'s", "", s)
    s = re.sub(r"can't", 'can not', s)
    s = re.compile(r"(?<=[a-zA-Z])n't").sub(' not', s)
    s = re.compile(r"(?<=[a-zA-Z])'ll").sub(' will', s)
    s = re.compile(r"(?<=[a-zA-Z])'d").sub(' would', s)
    return s
def lemmatize(l):
    wnl = WordNetLemmatizer()
    for word, tag in pos_tag(l):
        if tag.startswith('NN'):
            yield wnl.lemmatize(word, pos='n')
        elif tag.startswith('VB'):
            yield wnl.lemmatize(word, pos='v')
        elif tag.startswith('JJ'):
            yield wnl.lemmatize(word, pos='a')
        elif tag.startswith('R'):
            yield wnl.lemmatize(word, pos='r')
        else:
            yield word

In [5]:
text = get_text(list(df.text))
vec1 = TfidfVectorizer(min_df=2, max_df=1., ngram_range=(1, 1),stop_words= 'english')
X = vec1.fit_transform(text)
y = np.array(df.label)


In [7]:
print('MLP----hidden_layer_sizes---')
accdf = pd.DataFrame(np.random.randn(3, 3), index=['1', '2', '3'],columns=['hidden_layer_sizes', 'Accuracy','std'])
for i,hidden_layer_sizes in zip([0,1,2],[10,50,100,200]):
    MP = MLPClassifier(hidden_layer_sizes = (hidden_layer_sizes,))
    Y = y
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    accuracies = []
    for train, test in kf.split(X):
        MP.fit(X[train], Y[train])
        pred = MP.predict(X[test])
        accuracies.append(accuracy_score(Y[test], pred))
    mean_acc = np.mean(accuracies)
    std = np.std(accuracies)
    accdf['hidden_layer_sizes'][i] = hidden_layer_sizes
    accdf['Accuracy'][i] = mean_acc
    accdf['std'][i] = std
accdf

MLP----hidden_layer_sizes---




Unnamed: 0,hidden_layer_sizes,Accuracy,std
1,10.0,0.874826,0.011283
2,50.0,0.876318,0.020331
3,100.0,0.874847,0.019561


In [8]:
print('MLP----alpha---')
accdf = pd.DataFrame(np.random.randn(3, 3), index=['1', '2', '3'],columns=['alpha', 'Accuracy','std'])
for i,alpha in zip([0,1,2],[.001,.0001,.00001]):
    MP = MLPClassifier(alpha = alpha)
    Y = y
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    accuracies = []
    for train, test in kf.split(X):
        MP.fit(X[train], Y[train])
        pred = MP.predict(X[test])
        accuracies.append(accuracy_score(Y[test], pred))
    mean_acc = np.mean(accuracies)
    std = np.std(accuracies)
    accdf['alpha'][i] = alpha
    accdf['Accuracy'][i] = mean_acc
    accdf['std'][i] = std
accdf

MLP----alpha---


Unnamed: 0,alpha,Accuracy,std
1,0.001,0.877778,0.010834
2,0.0001,0.867473,0.026211
3,1e-05,0.874837,0.018475


In [9]:
print('RandomForest----min_samples_leaf---')
accdf = pd.DataFrame(np.random.randn(3, 3), index=['1', '2', '3'],columns=['min_samples_leaf', 'Accuracy','std'])

for i,min_samples_leaf in zip([0,1,2],[1,3,5]):
    RFC = RandomForestClassifier(min_samples_leaf = min_samples_leaf)

    Y = y
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    accuracies = []
    for train, test in kf.split(X):
        RFC.fit(X[train], Y[train])
        pred = RFC.predict(X[test])
        accuracies.append(accuracy_score(Y[test], pred))
    mean_acc = np.mean(accuracies)
    std = np.std(accuracies)
    accdf['min_samples_leaf'][i] = min_samples_leaf
    accdf['Accuracy'][i] = mean_acc
    accdf['std'][i] = std
accdf
    

RandomForest----min_samples_leaf---


Unnamed: 0,min_samples_leaf,Accuracy,std
1,1.0,0.773192,0.025669
2,3.0,0.811547,0.027817
3,5.0,0.793813,0.016778


In [10]:
print('RandomForest----n_estimators---')
accdf = pd.DataFrame(np.random.randn(3, 3), index=['1', '2', '3'],columns=['n_estimators', 'Accuracy','std'])
for i,n_estimators  in zip([0,1,2],[100,200,300]):
#     print('==================n_estimators : %d ================' %(n_estimators))
    RFC = RandomForestClassifier(n_estimators = n_estimators )

    Y = y
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    accuracies = []
    for train, test in kf.split(X):
        RFC.fit(X[train], Y[train])
        pred = RFC.predict(X[test])
        accuracies.append(accuracy_score(Y[test], pred))
#         print(classification_report(Y[test], pred))
#     print('accuracy over all cross-validation folds: %s' % str(accuracies))
    mean_acc = np.mean(accuracies)
    std = np.std(accuracies)
#     print('mean=%.2f std=%.2f' % (mean_acc, std))
    accdf['n_estimators'][i] = n_estimators
    accdf['Accuracy'][i] = mean_acc
    accdf['std'][i] = std
accdf

RandomForest----n_estimators---


Unnamed: 0,n_estimators,Accuracy,std
1,100.0,0.864532,0.025607
2,200.0,0.882211,0.022646
3,300.0,0.877756,0.021136
