In [2]:
pip install numpy

Note: you may need to restart the kernel to use updated packages.


In [3]:
import matplotlib.pyplot as plt
import re
import os
import pandas as pd
import numpy as np

import nltk
from nltk.tokenize import word_tokenize
from nltk import PorterStemmer
from nltk import WordNetLemmatizer
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

import gensim



In [4]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\chris\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
data = pd.read_csv("News_Final.csv")

In [6]:
data.info() 
#no further preprocessing for null values because of no null values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38258 entries, 0 to 38257
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    38258 non-null  object
 1   text     38258 non-null  object
 2   subject  38258 non-null  object
 3   date     38258 non-null  object
 4   Label    38258 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 1.5+ MB


#### Preprocessing

In [7]:
# remove non alphabets
remove_non_alphabets = lambda x: re.sub(r'[^a-zA-Z]',' ',x)

# token alphabets-only list
tokenize = lambda x: word_tokenize(x)

# assign ps to a lambda function to run on each line of value
ps = PorterStemmer()
stem = lambda w: [ ps.stem(x) for x in w ]

# assign lemmatizer to a lambda function to run on each line of value
lemmatizer = WordNetLemmatizer()
leammtizer = lambda x: [ lemmatizer.lemmatize(word) for word in x ]

In [8]:
# apply all above methods to the column ''text
print('Processing : [=', end='')
data['text'] = data['text'].apply(remove_non_alphabets)
print('=', end='')
data['text'] = data['text'].apply(tokenize)
print('=', end='')
data['text'] = data['text'].apply(stem)
print('=', end='')
data['text'] = data['text'].apply(leammtizer)
print('=', end='')
data['text'] = data['text'].apply(lambda x: ' '.join(x))
print('] : Completed', end='')
data.head()

Processing : [=====] : Completed

Unnamed: 0,title,text,subject,date,Label
0,FLASHBACK: KING OBAMA COMMUTES SENTENCES OF 22...,just make room for hillari presid obama today ...,politics,31-Mar-15,1
1,APPLE’S CEO SAYS RELIGIOUS FREEDOM LAWS ARE ‘D...,the gay mafia ha a new corpor don thi is the o...,politics,31-Mar-15,1
2,WATCH DIRTY HARRY REID ON HIS LIE ABOUT ROMNEY...,In case you miss it sen harri reid R NV who an...,politics,31-Mar-15,1
3,OH NO! GUESS WHO FUNDED THE SHRINE TO TED KENNEDY,noth like polit cronyism to make your stomach ...,politics,31-Mar-15,1
4,BENGHAZI PANEL CALLS HILLARY TO TESTIFY UNDER ...,doe anyon realli think hillari clinton will co...,politics,31-Mar-15,1


In [9]:
# apply all above methods to the column ''title
print('Processing : [=', end='')
data['title'] = data['title'].apply(remove_non_alphabets)
print('=', end='')
data['title'] = data['title'].apply(tokenize)
print('=', end='')
data['title'] = data['title'].apply(stem)
print('=', end='')
data['title'] = data['title'].apply(leammtizer)
print('=', end='')
data['title'] = data['title'].apply(lambda x: ' '.join(x))
print('] : Completed', end='')
data.head()

Processing : [=====] : Completed

Unnamed: 0,title,text,subject,date,Label
0,flashback king obama commut sentenc OF drug de...,just make room for hillari presid obama today ...,politics,31-Mar-15,1
1,appl S ceo say religi freedom law are danger T...,the gay mafia ha a new corpor don thi is the o...,politics,31-Mar-15,1
2,watch dirti harri reid ON hi lie about romney ...,In case you miss it sen harri reid R NV who an...,politics,31-Mar-15,1
3,OH NO guess who fund the shrine TO ted kennedi,noth like polit cronyism to make your stomach ...,politics,31-Mar-15,1
4,benghazi panel call hillari TO testifi under o...,doe anyon realli think hillari clinton will co...,politics,31-Mar-15,1


#### Split Train-Test Sets

In [13]:
# split to 30 percent test data and 70 percent train data
# labels can be seen as y, an dependent variable
train_corpus, test_corpus, train_labels, test_labels = train_test_split(data["text"],
                                                                        data["Label"],
                                                                        test_size=0.3)

#### Prepare for Machine Learning

In [14]:
# build bag of words features' vectorizer and get features
bow_vectorizer=CountVectorizer(min_df=1, ngram_range=(1,1))
bow_train_features = bow_vectorizer.fit_transform(train_corpus)
bow_test_features = bow_vectorizer.transform(test_corpus)

In [15]:
# build tfidf features' vectorizer and get features
tfidf_vectorizer=TfidfVectorizer(min_df=1, 
                                 norm='l2',
                                 smooth_idf=True,
                                 use_idf=True,
                                 ngram_range=(1,1))
tfidf_train_features = tfidf_vectorizer.fit_transform(train_corpus)  
tfidf_test_features = tfidf_vectorizer.transform(test_corpus)    

In [16]:
# tokenize documents for word2vec
tokenized_train = [nltk.word_tokenize(text)
                   for text in train_corpus]
tokenized_test = [nltk.word_tokenize(text)
                   for text in test_corpus]  

# build word2vec model                   
wv_model = gensim.models.Word2Vec(tokenized_train, vector_size=200, window=60, min_count=10)
#set the size or dimension for the word vectors
#specify the length of the window of words taken as context 
#ignores all words with total frequency lower than 10

def average_word_vectors(words, model, vocabulary, num_features):
    
    feature_vector = np.zeros((num_features,),dtype="float64")
    nwords = 0.
    
    for word in words:
        if word in vocabulary: 
            nwords = nwords + 1.
            feature_vector = np.add(feature_vector, model.wv[word])
    
    if nwords:
        feature_vector = np.divide(feature_vector, nwords)
        
    return feature_vector 
   

def averaged_word_vectorizer(corpus, model, num_features):
    vocabulary = set(model.wv.index_to_key)
    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features)
                    for tokenized_sentence in corpus]
    return np.array(features)

# averaged word vector features from word2vec
avg_wv_train_features = averaged_word_vectorizer(corpus=tokenized_train,
                                                 model=wv_model,
                                                 num_features=200)                   
avg_wv_test_features = averaged_word_vectorizer(corpus=tokenized_test,
                                                model=wv_model,
                                                num_features=200) 

#### Define Evaluation Function

In [17]:
# define a function to evaluate our classification models based on four metrics
# This defined function is also useful in other cases. This is comparing test_y and pred_y. 
# Both contain 1s and 0s.
def get_metrics(true_labels, predicted_labels):
    metrics_dict = dict(zip(["accuracy", "precision", "recall", "f1"], [None]*4))
    #metrics_dict = {i:None for i in ["accuracy", "precision", "recall", "f1"]}
    for m in metrics_dict.keys():
        exec('''metrics_dict['{}'] = np.round(                                                    
                        metrics.{}_score(true_labels, 
                                               predicted_labels),
                        2)'''.format(m, m))
    return metrics_dict

#### Define Easy to Use Function for Train/Test/Evaluate

In [18]:
# define a function that trains the model, performs predictions and evaluates the predictions
def train_predict_evaluate_model(classifier, 
                                 train_features, train_labels, 
                                 test_features, test_labels):
    # build model    
    classifier.fit(train_features, train_labels)
    # predict using model
    predictions = classifier.predict(test_features) 
    # evaluate model prediction performance   
    '''get_metrics(true_labels=test_labels, 
                predicted_labels=predictions)'''
    print(metrics.classification_report(test_labels,predictions))
    return predictions, get_metrics(true_labels=test_labels, predicted_labels=predictions)   

#### import classifiers

In [19]:
from sklearn.naive_bayes import GaussianNB 
from sklearn.naive_bayes import MultinomialNB # import naive bayes
from sklearn.tree import DecisionTreeClassifier # import Decision Tree
from sklearn.ensemble import RandomForestClassifier # import random forest

#### train and test on BOW features 

In [20]:
# assign naive bayes function to an object
mnb = MultinomialNB()
mnb2 = GaussianNB()

# predict and evaluate naive bayes
mnb_bow_predictions, mnb_bow_metrics = train_predict_evaluate_model(classifier=mnb,
                                           train_features=bow_train_features,
                                           train_labels=train_labels,
                                           test_features=bow_test_features,
                                           test_labels=test_labels)

              precision    recall  f1-score   support

           0       0.95      0.96      0.95      6148
           1       0.95      0.94      0.95      5330

    accuracy                           0.95     11478
   macro avg       0.95      0.95      0.95     11478
weighted avg       0.95      0.95      0.95     11478



#### Train and Test on TFIDF features

In [21]:
# predict and evaluate naive bayes
mnb_tfidf_predictions, mnb_tfidf_metrics = train_predict_evaluate_model(classifier=mnb,
                                           train_features=tfidf_train_features,
                                           train_labels=train_labels,
                                           test_features=tfidf_test_features,
                                           test_labels=test_labels)

              precision    recall  f1-score   support

           0       0.91      0.96      0.94      6148
           1       0.96      0.90      0.93      5330

    accuracy                           0.93     11478
   macro avg       0.94      0.93      0.93     11478
weighted avg       0.93      0.93      0.93     11478



#### train and Test on Word2Vec features

In [22]:
# predict and evaluate naive bayes
mnb_avgwv_predictions, mnb_avgwv_metrics = train_predict_evaluate_model(classifier=mnb2,
                                           train_features=avg_wv_train_features,
                                           train_labels=train_labels,
                                           test_features=avg_wv_test_features,
                                           test_labels=test_labels)

              precision    recall  f1-score   support

           0       0.92      0.94      0.93      6148
           1       0.93      0.91      0.92      5330

    accuracy                           0.93     11478
   macro avg       0.93      0.92      0.92     11478
weighted avg       0.93      0.93      0.93     11478

