In [None]:
import pandas as pd
import numpy as np
import seaborn as sb
import nltk
import re
# Import word_tokenize and stopwords from nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer, TfidfVectorizer

import nltk
import nltk.corpus
from gensim.models.word2vec import Word2Vec
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm
from sklearn.svm import LinearSVC

from sklearn.metrics import confusion_matrix, f1_score, classification_report

In [9]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Conv1D, MaxPool1D
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
test = pd.read_csv('test.csv')
train = pd.read_csv('train.csv')

In [3]:
# Shape of each set

print("Shape of TRAIN DATA: ", train.shape)
print("Shape of TEST DATA: ", test.shape)

Shape of TRAIN DATA:  (10240, 2)
Shape of TEST DATA:  (2551, 2)


In [21]:
test.head()

Unnamed: 0,Statement,Label
0,Building a wall on the U.S.-Mexico border will...,True
1,Wisconsin is on pace to double the number of l...,False
2,Says John McCain has done nothing to help the ...,False
3,Suzanne Bonamici supports a plan that will cut...,True
4,When asked by a reporter whether hes at the ce...,False


In [5]:
train.head()

Unnamed: 0,Statement,Label
0,Says the Annies List political group supports ...,False
1,When did the decline of coal start? It started...,True
2,"Hillary Clinton agrees with John McCain ""by vo...",True
3,Health care reform legislation is likely to ma...,False
4,The economic turnaround started at the end of ...,True


In [6]:
# Saving the original Data for future

train_orig = train.copy()
test_orig = test.copy()

In [3]:
def process_statements(text):
    
    # Tokenize the words
    tokenized = word_tokenize(text)

    # Remove the stop words
    tokenized = [token for token in tokenized if token not in stopwords.words("english")] 

    # Lemmatize the words, changing text to lowercase
    lemmatizer = WordNetLemmatizer()
    tokenized = [lemmatizer.lemmatize(token.lower(), pos='a') for token in tokenized]

    # Remove non-alphabetic characters and keep the words contains three or more letters
    tokenized = [token for token in tokenized if token.isalpha() and len(token)>2]
    
    return tokenized
    
# Call the function and store the result into a new column
#train["Processed"] = train['Statement'].str.lower().apply(process_statements)

# Print the first fifteen rows of Processed
#display(train[["Processed"]].head(15))

In [23]:
#def tokenize(self, tokenizer=nltk.word_tokenize):
        def tokenize_row(row):
            row["text"] = tokenizer(row["text"])
            row["tokenized_text"] = [] + row["text"]
            return row

In [16]:
train.head()

Unnamed: 0,Statement,Label,Processed
0,Says the Annies List political group supports ...,False,"[says, annies, list, political, group, support..."
1,When did the decline of coal start? It started...,True,"[decline, coal, start, started, natural, gas, ..."
2,"Hillary Clinton agrees with John McCain ""by vo...",True,"[hillary, clinton, agrees, john, mccain, votin..."
3,Health care reform legislation is likely to ma...,False,"[health, care, reform, legislation, likely, ma..."
4,The economic turnaround started at the end of ...,True,"[economic, turnaround, started, end, term]"


In [4]:
#from sklearn.model_selection import train_test_split
#Train test split
#X_train, X_test, y_train, y_test = train_test_split(train["Statement"], train["Label"], test_size=0.25)

In [4]:
X_train = train["Statement"]
X_test = test["Statement"]
y_train = train["Label"]
y_test = test["Label"]

In [5]:
# Vectorize the text so the models can actually interpret it
vectorizer = TfidfVectorizer(analyzer=process_statements)
x_train_vectorized = vectorizer.fit_transform(X_train)
x_test_vectorized = vectorizer.transform(X_test)

In [6]:
print (x_train_vectorized.shape)
print (x_test_vectorized.shape)
print (y_train.shape)
print (y_test.shape)

(10240, 11179)
(2551, 11179)
(10240,)
(2551,)


In [7]:
log_reg = LogisticRegression()
log_reg.fit(x_train_vectorized, y_train)
logR_predicted = log_reg.predict(x_test_vectorized)
print(f"{log_reg.__class__.__name__} accuracy: {log_reg.score(x_test_vectorized, y_test)}")

LogisticRegression accuracy: 0.6095648765190121


In [8]:
print(classification_report(y_test, logR_predicted))
print('\n')
print(confusion_matrix(y_test, logR_predicted))

              precision    recall  f1-score   support

       False       0.60      0.46      0.52      1169
        True       0.62      0.74      0.67      1382

    accuracy                           0.61      2551
   macro avg       0.61      0.60      0.60      2551
weighted avg       0.61      0.61      0.60      2551



[[ 537  632]
 [ 364 1018]]


In [12]:
random_forest = RandomForestClassifier(n_estimators=10, max_depth=100)
random_forest.fit(x_train_vectorized, y_train)

print(f"{random_forest.__class__.__name__} accuracy: {random_forest.score(x_test_vectorized, y_test)}")

RandomForestClassifier accuracy: 0.587890625


In [14]:
rForest_pred = random_forest.predict(x_test_vectorized)
print(classification_report(y_test, rForest_pred))
print('\n')
print(confusion_matrix(y_test, rForest_pred))

              precision    recall  f1-score   support

       False       0.51      0.41      0.45      1065
        True       0.63      0.72      0.67      1495

    accuracy                           0.59      2560
   macro avg       0.57      0.56      0.56      2560
weighted avg       0.58      0.59      0.58      2560



[[ 434  631]
 [ 424 1071]]


In [20]:
multi_nb = MultinomialNB()
multi_nb.fit(x_train_vectorized, y_train)
print(f"{multi_nb.__class__.__name__} accuracy: {multi_nb.score(x_test_vectorized, y_test)}")

MultinomialNB accuracy: 0.6203125


In [23]:
NaiveB_pred = multi_nb.predict(x_test_vectorized)
print(classification_report(y_test, NaiveB_pred))
print('\n')
print(confusion_matrix(y_test, NaiveB_pred))

              precision    recall  f1-score   support

       False       0.59      0.29      0.39      1065
        True       0.63      0.85      0.72      1495

    accuracy                           0.62      2560
   macro avg       0.61      0.57      0.56      2560
weighted avg       0.61      0.62      0.59      2560



[[ 312  753]
 [ 219 1276]]


In [42]:
svm = svm.LinearSVC()
svm.fit(x_train_vectorized, y_train)
print(f"{svm.__class__.__name__} accuracy: {svm.score(x_test_vectorized, y_test)}")

LinearSVC accuracy: 0.58515625


In [44]:
svm_pred = svm.predict(x_test_vectorized)

print(classification_report(y_test, svm_pred))
print('\n')
print(confusion_matrix(y_test, svm_pred))

              precision    recall  f1-score   support

       False       0.50      0.48      0.49      1065
        True       0.64      0.66      0.65      1495

    accuracy                           0.59      2560
   macro avg       0.57      0.57      0.57      2560
weighted avg       0.58      0.59      0.58      2560



[[509 556]
 [506 989]]


# LSTM  (To Be Done)

In [28]:
MAX_NB_WORDS = 100000 # max number of words for tokenizer
MAX_SEQUENCE_LENGTH = 1000 # max length of each sentences, including padding
VALIDATION_SPLIT = 0.2 # 20% of data for validation (not used in training)
EMBEDDING_DIM = 100 # embedding dimensions for word vectors

#Creating Word Vectors by Word2Vec Method (takes time...)
#w2v_model = Word2Vec(sentences=X, size=EMBEDDING_DIM, window=5, min_count=1)

In [32]:
y_train = train["Label"].values

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(train['Statement'])
sequences = tokenizer.texts_to_sequences(train['Statement'])
word_index = tokenizer.word_index
print("Vocabulary size:", len(word_index))

Vocabulary size: 12408


Padding
Now we are going to add padding to our data to make it uniform. Keras makes it easy to pad our data by using pad_sequences function.

In [50]:
X_train = pad_sequences(sequences,maxlen = 1000, padding='post', truncating='post')

In [None]:
test_sequences = tokenizer.texts_to_sequences(X_test)
X_test = pad_sequences(test_sequences,maxlen = 1000,padding='post', truncating='post')