In [None]:
import numpy as np
import pandas as pd

import os
import re
import nltk

# Load the raw articles; later cells read the 'title', 'text' and 'label' columns.
data = pd.read_csv(filepath_or_buffer='dataframe.csv')

In [None]:
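# One-time setup: uncomment and run once to fetch the NLTK tokenizer and
# WordNet data needed by the preprocessing cells below.
#nltk.download('punkt')
#nltk.download('wordnet')
#nltk.download('omw-1.4')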

In [None]:
# Combine headline and body into one text field.
# - A space separator is required: without it the last word of the title is
#   fused with the first word of the body into a single bogus token.
# - fillna('') keeps a row usable when only one of the two fields is missing;
#   otherwise NaN + str yields NaN and the regex cleaning below fails on it.
data['total'] = data['title'].fillna('') + ' ' + data['text'].fillna('')

In [None]:
%%time

from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()


for index, row in data.iterrows():
    filter_sentence = ''
    
    sentence = row['total']
    sentence = re.sub(r'[^\w\s]', '', sentence)
    words = nltk.word_tokenize(sentence)

    for w in words:
        filter_sentence = filter_sentence + ' ' + str(lemmatizer.lemmatize(w)).lower()

        data.loc[index, 'total'] = filter_sentence

data = data[['total','label']]

In [None]:
# Preview the first 20 cleaned rows.
data.iloc[:20]

In [None]:
X = data['total']
Y = data['label']

from sklearn.model_selection import train_test_split

# Hold out 33% of the rows for evaluation. A fixed random_state makes the split
# (and therefore every accuracy / confusion matrix below) reproducible across
# kernel restarts.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.33, random_state=42
)

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Transform the training text into a sparse count matrix of unigrams and bigrams.
# The vocabulary is learned from the training split only.

count_vectorizer = CountVectorizer(ngram_range=(1, 2))
X_freqMatrix = count_vectorizer.fit_transform(x_train)

In [None]:
# Apply TF-IDF weighting (L2-normalised rows) to the CountVectorizer output.
# Note: x_train is rebound from raw text to the TF-IDF matrix here.

tfidf = TfidfTransformer(norm='l2')
tfidf.fit(X_freqMatrix)
x_train = tfidf.transform(X_freqMatrix)

In [None]:
# Project the test text through the vectorizer and TF-IDF transformer that were
# fitted on the training split (transform only - nothing is re-fitted here).
# Note: x_test is rebound from raw text to the TF-IDF matrix.

X_test_freqMatrix = count_vectorizer.transform(raw_documents=x_test)
x_test = tfidf.transform(X=X_test_freqMatrix)

In [None]:
# Display the training feature matrix produced by the TF-IDF step.
x_train

In [None]:
# Display the test feature matrix produced by the TF-IDF step.
x_test

In [None]:
# Display the training labels returned by train_test_split.
y_train

### Logistic Regression

In [None]:
%%time

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

logreg = LogisticRegression(C=1e5, solver='lbfgs', max_iter=1000000)
logreg.fit(x_train, y_train)
predicted = logreg.predict(x_test)
print( "Accuracy Percentage {:.2f}".format(logreg.score(x_test, y_test)) )
cm = confusion_matrix(y_test, predicted)
cm

### Multinomial Naive Bayes

In [None]:
%%time

from sklearn.naive_bayes import MultinomialNB

nb = MultinomialNB()
nb.fit(x_train, y_train)
predicted = nb.predict(x_test)
print( "Accuracy Percentage {:.2f}".format(nb.score(x_test, y_test)) )
cm = confusion_matrix(y_test, predicted)
cm

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Imported here so the cell does not depend on another cell having run first.
from sklearn.metrics import accuracy_score, confusion_matrix

# Random forests are stochastic; fix the seed so the reported numbers are
# reproducible across runs.
rf = RandomForestClassifier(random_state=42)
rf.fit(x_train, y_train)
predicted = rf.predict(x_test)
# Accuracy is a fraction in [0, 1]; scale by 100 so the value matches the
# "Percentage" label. Reusing `predicted` avoids a second pass over x_test.
print( "Accuracy Percentage {:.2f}".format(100 * accuracy_score(y_test, predicted)) )
cm = confusion_matrix(y_test, predicted)
cm

### Pipeline

In [None]:
data = pd.read_csv('dataframe.csv')

from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()


def build_total_text(frame):
    """Return a Series of cleaned 'title' + 'text' strings for `frame`.

    Title and body are joined with a space (so their boundary words are not
    fused) with missing values treated as empty strings; each result is then
    stripped of punctuation, tokenized, lemmatized and lower-cased.
    """
    def clean(sentence):
        stripped = re.sub(r'[^\w\s]', '', sentence)
        return ' '.join(
            str(lemmatizer.lemmatize(w)).lower() for w in nltk.word_tokenize(stripped)
        )

    combined = frame['title'].fillna('') + ' ' + frame['text'].fillna('')
    return combined.map(clean)


# Fixed seed -> reproducible split.
train, test = train_test_split(data, test_size=0.3, random_state=42)
# Work on explicit copies: assigning a column on a slice returned by the split
# triggers pandas' SettingWithCopyWarning and may not stick.
train = train.copy()
test = test.copy()

# Apply the same cleaning to both splits so held-out text looks like the text
# the pipeline is trained on (the test split was previously left raw).
train['total'] = build_total_text(train)
test['total'] = build_total_text(test)

train = train[['total','label']]

X_train = train['total']
Y_train = train['label']

In [None]:
from sklearn.pipeline import Pipeline
import joblib

# Chain vectorisation, TF-IDF weighting and the Naive Bayes classifier into a
# single estimator so raw strings can be passed straight to fit/predict.
# NOTE(review): CountVectorizer here uses its default n-gram range, unlike the
# (1, 2) range used in the standalone experiment above - confirm intended.
pipeline = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer(norm='l2')),
    ('nb', MultinomialNB()),
])

pipeline.fit(X_train, Y_train)

In [None]:
# Persist the fitted pipeline to disk so it can be reloaded without retraining.

filename = 'pipeline.sav'
joblib.dump(value=pipeline, filename=filename)

In [None]:
# Reload the persisted pipeline and classify one already-cleaned article.
# SECURITY: joblib.load unpickles the file, which can execute arbitrary code -
# only load model files that you created yourself or otherwise trust.
filename = './pipeline.sav'

loaded_model = joblib.load(filename)

# The sample text contains a line break. A double-quoted literal cannot span
# physical lines, so it is written as two adjacent literals joined by an
# explicit "\n" escape (Python concatenates adjacent string literals).
sample_article = (
    "flynn hillary clinton big woman campus breitbart daniel j flynnever get feeling life circle roundabout rather head straight line toward intended destination hillary clinton remains big woman campus leafy liberal wellesley massachusetts everywhere else vote likely inauguration dress remainder day way miss havisham forever wore wedding dress speaking great expectations hillary rodham overflowed 48 year ago first addressed wellesley graduating class the president college informed gathered 1969 student needed debate far i could ascertain spokesman kind like democratic primary 2016 minus term unknown even seven sisters school i glad miss adams made clear i speaking today u 400 u miss rodham told classmate after appointing edger bergen charlie mccarthys mortimer snerds attendance bespectacled granny glass awarding matronly wisdom least john lennon wisdom took issue previous speaker despite becoming first win election seat u s senate since reconstruction edward brooke came criticism calling empathy goal protestors criticized tactic though clinton senior thesis saul alinsky lamented black power demagogue elitist arrogance repressive intolerance within new left similar word coming republican necessitated brief rebuttal trust rodham ironically observed 1969 one word i asked class rehearsal wanted say everyone came said talk trust talk lack trust u way feel others talk trust bust what say what say feeling permeates generation perhaps even understood distrusted the trust bust certainly busted clintons 2016 plan she certainly even understand people distrusted after whitewater travelgate vast conspiracy benghazi missing email clinton found distrusted voice friday there load compromising road broadening political horizon and distrust american people trump edged 48 percent 38 percent question immediately prior novembers election stood major reason closing horizon clinton described vanquisher supporter embracing lie con alternative fact assault truth \n"
    "reason she failed explain american people chose lie truth as history major among today know well people power invent fact attack question mark beginning end free society offered that hyperbole like many people emerge 1960s hillary clinton embarked upon long strange trip from high school goldwater girl wellesley college republican president democratic politician clinton drank time place gave degree more significantly went idealist cynic comparison two wellesley commencement address show way back lamented long leader viewed politics art possible challenge practice politics art making appears impossible possible now big woman campus odd woman white house wonder current station even possible why arent i 50 point ahead asked september in may asks isnt president the woman famously dubbed congenital liar bill safire concludes lie mind getting stood election day like finding jilted bride wedding day inspires dangerous delusion"
)
result = loaded_model.predict([sample_article])
print(result)