In [13]:
import matplotlib.pyplot as plt
import re
import os
import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import word_tokenize
from nltk import PorterStemmer
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

import gensim

In [51]:
data = pd.read_csv("News_Final.csv")

In [52]:
data.info() 
#no further preprocessing for null values because of no null values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38258 entries, 0 to 38257
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    38258 non-null  object
 1   text     38258 non-null  object
 2   subject  38258 non-null  object
 3   date     38258 non-null  object
 4   Label    38258 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 1.5+ MB


In [53]:
# remove non alphabets
remove_non_alphabets = lambda x: re.sub(r'[^a-zA-Z]',' ',x)

# tokenn alphabets-only list
tokenize = lambda x: word_tokenize(x)

# assign ps to a lambda function to run on each line of value
ps = PorterStemmer()
stem = lambda w: [ ps.stem(x) for x in w ]

# assign lemmatizer to a lambda function to run on each line of value
lemmatizer = WordNetLemmatizer()
leammtizer = lambda x: [ lemmatizer.lemmatize(word) for word in x ]

In [55]:
# apply all above methods to the column ''text
print('Processing : [=', end='')
data['text'] = data['text'].apply(remove_non_alphabets)
print('=', end='')
data['text'] = data['text'].apply(tokenize)
print('=', end='')
data['text'] = data['text'].apply(stem)
print('=', end='')
data['text'] = data['text'].apply(leammtizer)
print('=', end='')
data['text'] = data['text'].apply(lambda x: ' '.join(x))
print('] : Completed', end='')
data.head()

Processing : [=====] : Completed

Unnamed: 0,title,text,subject,date,Label
0,FLASHBACK: KING OBAMA COMMUTES SENTENCES OF 22...,just make room for hillari presid obama today ...,politics,31-Mar-15,1
1,APPLE’S CEO SAYS RELIGIOUS FREEDOM LAWS ARE ‘D...,the gay mafia ha a new corpor don thi is the o...,politics,31-Mar-15,1
2,WATCH DIRTY HARRY REID ON HIS LIE ABOUT ROMNEY...,In case you miss it sen harri reid R NV who an...,politics,31-Mar-15,1
3,OH NO! GUESS WHO FUNDED THE SHRINE TO TED KENNEDY,noth like polit cronyism to make your stomach ...,politics,31-Mar-15,1
4,BENGHAZI PANEL CALLS HILLARY TO TESTIFY UNDER ...,doe anyon realli think hillari clinton will co...,politics,31-Mar-15,1


In [56]:
# apply all above methods to the column ''title
print('Processing : [=', end='')
data['title'] = data['title'].apply(remove_non_alphabets)
print('=', end='')
data['title'] = data['title'].apply(tokenize)
print('=', end='')
data['title'] = data['title'].apply(stem)
print('=', end='')
data['title'] = data['title'].apply(leammtizer)
print('=', end='')
data['title'] = data['title'].apply(lambda x: ' '.join(x))
print('] : Completed', end='')
data.head()

Processing : [=====] : Completed

Unnamed: 0,title,text,subject,date,Label
0,flashback king obama commut sentenc OF drug de...,just make room for hillari presid obama today ...,politics,31-Mar-15,1
1,appl S ceo say religi freedom law are danger T...,the gay mafia ha a new corpor don thi is the o...,politics,31-Mar-15,1
2,watch dirti harri reid ON hi lie about romney ...,In case you miss it sen harri reid R NV who an...,politics,31-Mar-15,1
3,OH NO guess who fund the shrine TO ted kennedi,noth like polit cronyism to make your stomach ...,politics,31-Mar-15,1
4,benghazi panel call hillari TO testifi under o...,doe anyon realli think hillari clinton will co...,politics,31-Mar-15,1


In [57]:
# split to 30 percent test data and 70 percent train data
# labels can be seen as y, an dependent variable
train_corpus, test_corpus, train_labels, test_labels = train_test_split(data["text"],
                                                                        data["Label"],
                                                                        test_size=0.3)

In [59]:
# build bag of words features' vectorizer and get features
bow_vectorizer=CountVectorizer(min_df=1, ngram_range=(1,1))
bow_train_features = bow_vectorizer.fit_transform(train_corpus)
bow_test_features = bow_vectorizer.transform(test_corpus)

In [60]:
# build tfidf features' vectorizer and get features
tfidf_vectorizer=TfidfVectorizer(min_df=1, 
                                 norm='l2',
                                 smooth_idf=True,
                                 use_idf=True,
                                 ngram_range=(1,1))
tfidf_train_features = tfidf_vectorizer.fit_transform(train_corpus)  
tfidf_test_features = tfidf_vectorizer.transform(test_corpus)    

In [63]:
# define a function to evaluate our classification models based on four metrics
# This defined function is also useful in other cases. This is comparing test_y and pred_y. 
# Both contain 1s and 0s.
def get_metrics(true_labels, predicted_labels):
    metrics_dict = dict(zip(["accuracy", "precision", "recall", "f1"], [None]*4))
    #metrics_dict = {i:None for i in ["accuracy", "precision", "recall", "f1"]}
    for m in metrics_dict.keys():
        exec('''metrics_dict['{}'] = np.round(                                                    
                        metrics.{}_score(true_labels, 
                                               predicted_labels),
                        2)'''.format(m, m))
    return metrics_dict

In [66]:
# define a function that trains the model, performs predictions and evaluates the predictions
def train_predict_evaluate_model(classifier, 
                                 train_features, train_labels, 
                                 test_features, test_labels):
    # build model    
    classifier.fit(train_features, train_labels)
    # predict using model
    predictions = classifier.predict(test_features) 
    # evaluate model prediction performance   
    '''get_metrics(true_labels=test_labels, 
                predicted_labels=predictions)'''
    print(metrics.classification_report(test_labels,predictions))
    return predictions, get_metrics(true_labels=test_labels, predicted_labels=predictions) 

In [67]:
from sklearn.ensemble import RandomForestClassifier # import random forest

In [68]:
# assign random forest function to an object
rf = RandomForestClassifier(criterion="entropy")

# predict and evaluate random forest
rf_bow_predictions, rf_bow_metrics = train_predict_evaluate_model(classifier=rf,
                                           train_features=bow_train_features,
                                           train_labels=train_labels,
                                           test_features=bow_test_features,
                                           test_labels=test_labels)

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      6256
           1       0.99      0.97      0.98      5222

    accuracy                           0.98     11478
   macro avg       0.98      0.98      0.98     11478
weighted avg       0.98      0.98      0.98     11478



In [74]:
# predict and evaluate random forest
rf_tfidf_predictions, rf_tfidf_metrics = train_predict_evaluate_model(classifier=rf,
                                           train_features=tfidf_train_features,
                                           train_labels=train_labels,
                                           test_features=tfidf_test_features,
                                           test_labels=test_labels)

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      6256
           1       0.99      0.97      0.98      5222

    accuracy                           0.98     11478
   macro avg       0.98      0.98      0.98     11478
weighted avg       0.98      0.98      0.98     11478



In [70]:
# predict and evaluate random forest
rf_avgwv_predictions, rf_avgwv_metrics = train_predict_evaluate_model(classifier=rf,
                                           train_features=tfidf_train_features,
                                           train_labels=train_labels,
                                           test_features=tfidf_test_features,
                                           test_labels=test_labels)

              precision    recall  f1-score   support

           0       0.97      0.99      0.98      6256
           1       0.99      0.97      0.98      5222

    accuracy                           0.98     11478
   macro avg       0.98      0.98      0.98     11478
weighted avg       0.98      0.98      0.98     11478



In [76]:
# create a dictionary that stores all the accuracy information
performance_dict = {}

for me in ["accuracy", "precision", "recall", "f1"]:
    performance_dict[me] = {}
    for m in ["rf"]:
        performance_dict[me][m] = {}
        for f in ["bow","tfidf","avgwv"]:
            exec('performance_dict["{}"]["{}"]["{}"] = {}_{}_metrics["{}"]'.format(me, m, f, m, f, me))
        
#Accuracy Matrix
print("\n\033[1;31mAccuracy Metrix\n\033[0m")
print(pd.DataFrame(performance_dict["accuracy"]).rename(columns={ 
                                            "rf":"Random Forest"}, 
                                   index={"bow":"Bag-of-words", 
                                          "tfidf":"TFIDF", 
                                          "avgwv":"Word2Vec"}))

#Precision Matrix
print("\n\033[1;31mPrecision Metrix\n\033[0m")
print(pd.DataFrame(performance_dict["precision"]).rename(columns={ 
                                            "rf":"Random Forest"}, 
                                   index={"bow":"Bag-of-words", 
                                          "tfidf":"TFIDF", 
                                          "avgwv":"Word2Vec"}))

#Recall Matrix
print("\n\033[1;31mRecall Metrix\n\033[0m")
print(pd.DataFrame(performance_dict["recall"]).rename(columns={ 
                                            "rf":"Random Forest"}, 
                                   index={"bow":"Bag-of-words", 
                                          "tfidf":"TFIDF", 
                                          "avgwv":"Word2Vec"}))

#F1 Score Matrix
print("\n\033[1;31mF1 Score Metrix\n\033[0m")
print(pd.DataFrame(performance_dict["f1"]).rename(columns={ 
                                            "rf":"Random Forest"}, 
                                   index={"bow":"Bag-of-words", 
                                          "tfidf":"TFIDF", 
                                          "avgwv":"Word2Vec"}))


[1;31mAccuracy Metrix
[0m
              Random Forest
Word2Vec               0.98
Bag-of-words           0.98
TFIDF                  0.98

[1;31mPrecision Metrix
[0m
              Random Forest
Word2Vec               0.99
Bag-of-words           0.99
TFIDF                  0.99

[1;31mRecall Metrix
[0m
              Random Forest
Word2Vec               0.97
Bag-of-words           0.97
TFIDF                  0.97

[1;31mF1 Score Metrix
[0m
              Random Forest
Word2Vec               0.98
Bag-of-words           0.98
TFIDF                  0.98
