# Import Libraries

In [1]:
import nltk
import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


# Read the dataset

In [2]:
df = pd.read_csv(r'D:\ml & nlp project\train.csv')

# EDA

In [3]:
df.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20800 non-null  int64 
 1   title   20242 non-null  object
 2   author  18843 non-null  object
 3   text    20761 non-null  object
 4   label   20800 non-null  int64 
dtypes: int64(2), object(3)
memory usage: 812.6+ KB


In [5]:
df.nunique()

id        20800
title     19803
author     4201
text      20386
label         2
dtype: int64

In [6]:
df.isnull().sum()

id           0
title      558
author    1957
text        39
label        0
dtype: int64

In [7]:
df['label'].value_counts()

label
1    10413
0    10387
Name: count, dtype: int64

In [8]:
df.fillna(' ', inplace=True)

In [9]:
df['content'] = df['title'] + ' ' + df['text']

In [10]:
df.head()

Unnamed: 0,id,title,author,text,label,content
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1,House Dem Aide: We Didn’t Even See Comey’s Let...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0,"FLYNN: Hillary Clinton, Big Woman on Campus - ..."
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1,Why the Truth Might Get You Fired Why the Trut...
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1,15 Civilians Killed In Single US Airstrike Hav...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1,Iranian woman jailed for fictional unpublished...


# Splitting data into train and test

In [11]:
train, test = train_test_split(df, test_size=0.2, random_state=42)

# Data cleaning and preprocessing

In [12]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Omar Ayman\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [13]:
from sklearn.base import BaseEstimator, TransformerMixin
sett = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
port_stem = nltk.PorterStemmer()

class Preprocessing(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        return self
    
    
    def clean_text(sekf, text):
        text = text.lower()
        text = re.sub(r'\d+', '', text)
        text = text.translate(str.maketrans('', '', string.punctuation))
        text = text.strip()
        text = word_tokenize(text)
        text = [word for word in text if word not in sett]
        text = [lemmatizer.lemmatize(word) for word in text]
        text = ' '.join(text)
    
        return text
    
    
    def stemming(self, content):
        review = re.sub('[^a-zA-Z]',' ',content)
        review = review.lower()
        review = review.split()
        review = [port_stem.stem(word) for word in review if not word in sett]
        review = ' '.join(review)
        return review

    def transform(self, X, y=None):
        return X.apply(lambda text: self.stemming(self.clean_text(text)))

In [14]:
from sklearn.pipeline import Pipeline

content_pipeline = Pipeline([
        ('preprocess', Preprocessing())
])

In [15]:
prepared_train=train.copy()
prepared_train['content']=content_pipeline.fit_transform(prepared_train['content'])

prepared_test=test.copy()
prepared_test['content']=content_pipeline.transform(prepared_test['content'])

In [16]:
prepared_train.head()

Unnamed: 0,id,title,author,text,label,content
5933,5933,"Raccoons in Central Park Draw Crowds, and Warn...",Sarah Maslin Nir,"Like a little crew of bandits, the first few...",0,raccoon central park draw crowd warn stay away...
9646,9646,Bolton on Susan Rice Scandal: Obama Needs to B...,John Hayward,"On Thursday’s Breitbart News Daily, SiriusXM h...",0,bolton susan rice scandal obama need ask knew ...
8642,8642,IF HILLARY CLINTON IS CHARGED WITH OBSTRUCTION...,Iron Sheik,Home › POLITICS › IF HILLARY CLINTON IS CHARGE...,1,hillari clinton charg obstruct justic could go...
15126,15126,New York Times’s Moscow Bureau Was Targeted by...,Nicole Perlroth and David E. Sanger,The New York Times’s Moscow bureau was the tar...,0,new york time moscow bureau target hacker new ...
3410,3410,Gays Against Sharia March Branded ’Islamophobi...,Donna Rachel Edmunds,An event organised by Gays Against Sharia to m...,0,gay sharia march brand islamophob picket lgbt ...


In [17]:

tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(prepared_train['content'])
y_train = prepared_train['label']

X_test = tfidf.transform(prepared_test['content'])
y_test = prepared_test['label']

# Machine learning models

**Logestic regression**

In [18]:
from sklearn.metrics import accuracy_score , ConfusionMatrixDisplay , classification_report , roc_curve

from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, y_train)

# evaluation
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)

print(f'train score: {classification_report(y_train, train_pred)}')
print(f'test score: {classification_report(y_test, test_pred)}')

train score:               precision    recall  f1-score   support

           0       0.98      0.97      0.97      8255
           1       0.97      0.98      0.97      8385

    accuracy                           0.97     16640
   macro avg       0.97      0.97      0.97     16640
weighted avg       0.97      0.97      0.97     16640

test score:               precision    recall  f1-score   support

           0       0.95      0.94      0.95      2132
           1       0.94      0.95      0.95      2028

    accuracy                           0.95      4160
   macro avg       0.95      0.95      0.95      4160
weighted avg       0.95      0.95      0.95      4160



**Naive Bayes**

In [19]:
# NB model
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train, y_train)
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)
print(f'train score: {classification_report(y_train, train_pred)}')
print(f'test score: {classification_report(y_test, test_pred)}')

train score:               precision    recall  f1-score   support

           0       0.84      0.99      0.91      8255
           1       0.99      0.82      0.90      8385

    accuracy                           0.90     16640
   macro avg       0.92      0.90      0.90     16640
weighted avg       0.92      0.90      0.90     16640

test score:               precision    recall  f1-score   support

           0       0.80      0.99      0.88      2132
           1       0.98      0.74      0.84      2028

    accuracy                           0.87      4160
   macro avg       0.89      0.86      0.86      4160
weighted avg       0.89      0.87      0.86      4160



**Decision Tree**

In [20]:
# dt model
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)
print(f'train score: {classification_report(y_train, train_pred)}')
print(f'test score: {classification_report(y_test, test_pred)}')

train score:               precision    recall  f1-score   support

           0       1.00      1.00      1.00      8255
           1       1.00      1.00      1.00      8385

    accuracy                           1.00     16640
   macro avg       1.00      1.00      1.00     16640
weighted avg       1.00      1.00      1.00     16640

test score:               precision    recall  f1-score   support

           0       0.94      0.94      0.94      2132
           1       0.93      0.94      0.94      2028

    accuracy                           0.94      4160
   macro avg       0.94      0.94      0.94      4160
weighted avg       0.94      0.94      0.94      4160



In [21]:
from sklearn.model_selection import cross_val_score
results = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=10)
results

In [None]:
print(f'score: {results.mean()}')

score: 0.933233173076923


**Random forest**

In [None]:
# rf model
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
model.fit(X_train, y_train)
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)
print(f'train score: {classification_report(y_train, train_pred)}')
print(f'test score: {classification_report(y_test, test_pred)}')

train score:               precision    recall  f1-score   support

           0       1.00      1.00      1.00      8255
           1       1.00      1.00      1.00      8385

    accuracy                           1.00     16640
   macro avg       1.00      1.00      1.00     16640
weighted avg       1.00      1.00      1.00     16640

test score:               precision    recall  f1-score   support

           0       0.90      0.96      0.93      2132
           1       0.95      0.89      0.92      2028

    accuracy                           0.92      4160
   macro avg       0.93      0.92      0.92      4160
weighted avg       0.92      0.92      0.92      4160



**Grid Search**

In [None]:
from sklearn.model_selection import GridSearchCV

param_grid = [
 {'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
 {'bootstrap': [False], 'n_estimators': [3, 10, 30], 'max_features': [2, 4, 6, 8]},
]

grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', return_train_score=True)

grid_search.fit(X_train, y_train)

KeyboardInterrupt: 

In [None]:
grid_search.best_params_

{'max_features': 4, 'n_estimators': 30}

In [None]:
grid_search.best_score_

0.8233173076923077

**KNN**

In [None]:
# load KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier
model = KNeighborsClassifier()
model.fit(X_train, y_train)
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)
print(f'train score: {classification_report(y_train, train_pred)}')
print(f'test score: {classification_report(y_test, test_pred)}')

train score:               precision    recall  f1-score   support

           0       0.98      0.24      0.39      8255
           1       0.57      0.99      0.73      8385

    accuracy                           0.62     16640
   macro avg       0.77      0.62      0.56     16640
weighted avg       0.77      0.62      0.56     16640

test score:               precision    recall  f1-score   support

           0       0.97      0.20      0.33      2132
           1       0.54      0.99      0.70      2028

    accuracy                           0.59      4160
   macro avg       0.75      0.60      0.52      4160
weighted avg       0.76      0.59      0.51      4160



**Voting System**

In [None]:
from sklearn.ensemble import VotingClassifier

vot = VotingClassifier(
    estimators=[
        ('Logestic Regression', LogisticRegression()),
        ('Naive Bayes', MultinomialNB()),
        ('Descision Tree', DecisionTreeClassifier()),
        ('Random Forest', RandomForestClassifier(n_estimators=50, random_state=45)),
        ('KNN', KNeighborsClassifier())
    ],
    voting='hard'
)

vot.fit(X_train, y_train)

In [None]:
train_prediction = vot.predict(X_train)
test_prediction = vot.predict(X_test)
print(f'train score: {classification_report(y_train, train_pred)}')
print(f'test score: {classification_report(y_test, test_pred)}')

train score:               precision    recall  f1-score   support

           0       0.98      0.24      0.39      8255
           1       0.57      0.99      0.73      8385

    accuracy                           0.62     16640
   macro avg       0.77      0.62      0.56     16640
weighted avg       0.77      0.62      0.56     16640

test score:               precision    recall  f1-score   support

           0       0.97      0.20      0.33      2132
           1       0.54      0.99      0.70      2028

    accuracy                           0.59      4160
   macro avg       0.75      0.60      0.52      4160
weighted avg       0.76      0.59      0.51      4160

