In [1]:
import nltk, warnings, string
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
warnings.filterwarnings('ignore')
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier, PassiveAggressiveClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, ExtraTreesClassifier, GradientBoostingClassifier, BaggingClassifier, VotingClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from wordcloud import WordCloud
import re
from sklearn.model_selection import train_test_split

In [3]:
# Path is relative to the notebook's working directory.
DATA_PATH = 'Twitter_Data.csv'

df = pd.read_csv(DATA_PATH)
df.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [4]:
# Drop every row with a missing value in any column (defaults spelled out);
# df itself is modified, so later cells see only complete rows.
df.dropna(axis=0, how='any', inplace=True)

In [5]:
import re
# Regex of spans that get replaced by a space during cleaning. Alternatives,
# tried left to right:
#   @\S+            an "@" followed by a run of non-whitespace characters
#   https?:\S+      "http:" / "https:" followed by a run of non-whitespace
#   http?:\S        "htt:" / "http:" plus ONE non-whitespace character
#                   NOTE(review): looks like a leftover typo; kept unchanged so
#                   the cleaned text stays identical
#   [^A-Za-z0-9]+   any run of characters that are not ASCII letters or digits
# Written as a raw string: "\S" in a normal string literal is an invalid escape
# sequence (a SyntaxWarning on recent Python versions, hidden in this notebook
# because warnings are globally ignored). The pattern value itself is unchanged.
TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

from wordcloud import STOPWORDS
# Extra tokens to filter out on top of the imported STOPWORDS collection.
# update() changes the imported STOPWORDS object itself, not a copy.
EXTRA_STOPWORDS = (
    'rt', 'mkr', 'didn', 'bc', 'n', 'm', 'im', 'll',
    'y', 've', 'u', 'ur', 'don', 't', 's',
)
STOPWORDS.update(EXTRA_STOPWORDS)


In [6]:
def lower(text):
    """Return a lowercase copy of ``text`` (expects an object with ``.lower()``)."""
    lowered = text.lower()
    return lowered

def remove_twitter(text):
    """Replace every match of ``TEXT_CLEANING_RE`` in ``text`` with one space.

    Each match is replaced separately, so adjacent matches can leave several
    consecutive spaces in the result.
    """
    cleaner = re.compile(TEXT_CLEANING_RE)
    return cleaner.sub(' ', text)

def remove_stopwords(text):
    """Drop whitespace-separated tokens of ``text`` that appear in ``STOPWORDS``.

    ``text`` is coerced with ``str()`` first; the surviving tokens are rejoined
    with single spaces. The membership test is case-sensitive.
    """
    kept = []
    for token in str(text).split():
        if token in STOPWORDS:
            continue
        kept.append(token)
    return " ".join(kept)

def cleantext(text):
    """Apply the cleaning steps to ``text`` and return the result.

    Order matters: lowercase first, then the regex substitution, then stopword
    removal on the already-normalised tokens.
    """
    for step in (lower, remove_twitter, remove_stopwords):
        text = step(text)
    return text

In [7]:
# Overwrite the column with its cleaned version, one row at a time.
df['clean_text'] = df.clean_text.apply(cleantext)

In [8]:
# Single shared lemmatizer instance, reused for every row.
lematizer = WordNetLemmatizer()


def lemmatizer_words(text):
    """Lemmatize each whitespace-separated token of ``text``.

    Every token is passed to ``lematizer.lemmatize`` with its default
    arguments, and the results are rejoined with single spaces.
    """
    return " ".join(map(lematizer.lemmatize, text.split()))

In [9]:
# Lemmatize the cleaned text row by row, overwriting the column again.
df['clean_text'] = df['clean_text'].apply(lemmatizer_words)

In [10]:
# Features are the cleaned tweet text; the target is the category column.
X, y = df['clean_text'], df['category']

In [11]:
# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [12]:
# TF-IDF features with the vectorizer's default settings.
tfidf = TfidfVectorizer()
# Fit on the training split only, so the held-out text does not influence
# what the vectorizer learns.
X_train_vect = tfidf.fit_transform(X_train)
# Transform the test split with the already-fitted vectorizer (no refit).
X_test_vect = tfidf.transform(X_test)

In [13]:
# Number of entries in the vocabulary learned from the training split.
vocabulary_size = len(tfidf.vocabulary_)
vocabulary_size

80686

In [14]:
# Baseline: multinomial naive Bayes with default hyper-parameters.
mnb = MultinomialNB()
mnb.fit(X=X_train_vect, y=y_train)

In [15]:
# Evaluate naive Bayes on the held-out split: confusion matrix first,
# then the per-class precision/recall/F1 report.
mnb_pred = mnb.predict(X_test_vect)
mnb_confusion = confusion_matrix(y_test, mnb_pred)
mnb_report = classification_report(y_test, mnb_pred)
print(mnb_confusion)
print(mnb_report)


[[ 1246   384  9061]
 [   47  5411 11186]
 [   34   367 21155]]
              precision    recall  f1-score   support

        -1.0       0.94      0.12      0.21     10691
         0.0       0.88      0.33      0.47     16644
         1.0       0.51      0.98      0.67     21556

    accuracy                           0.57     48891
   macro avg       0.78      0.47      0.45     48891
weighted avg       0.73      0.57      0.50     48891



In [16]:
# Logistic regression with default hyper-parameters.
# NOTE(review): warnings are globally ignored in this notebook, so a
# non-convergence warning from this fit would not be shown — verify.
lr = LogisticRegression()
lr.fit(X=X_train_vect, y=y_train)

In [17]:
# Evaluate logistic regression on the held-out split: confusion matrix
# first, then the per-class precision/recall/F1 report.
lr_pred = lr.predict(X_test_vect)
lr_confusion = confusion_matrix(y_test, lr_pred)
lr_report = classification_report(y_test, lr_pred)
print(lr_confusion)
print(lr_report)

[[ 7951  1235  1505]
 [  278 15828   538]
 [  926  1557 19073]]
              precision    recall  f1-score   support

        -1.0       0.87      0.74      0.80     10691
         0.0       0.85      0.95      0.90     16644
         1.0       0.90      0.88      0.89     21556

    accuracy                           0.88     48891
   macro avg       0.87      0.86      0.86     48891
weighted avg       0.88      0.88      0.87     48891



In [19]:
from sklearn.tree import DecisionTreeClassifier

In [20]:
# Decision tree with default hyper-parameters, but with a fixed seed.
# Without random_state the tree can differ from one run to the next, so the
# metrics reported below would not be reproducible. 42 matches the seed used
# for train_test_split.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_vect, y_train)

In [21]:
# Evaluate the decision tree on the held-out split: confusion matrix
# first, then the per-class precision/recall/F1 report.
dt_pred = dt.predict(X_test_vect)
dt_confusion = confusion_matrix(y_test, dt_pred)
dt_report = classification_report(y_test, dt_pred)
print(dt_confusion)
print(dt_report)


[[ 7098  1244  2349]
 [  730 14767  1147]
 [ 2098  1752 17706]]
              precision    recall  f1-score   support

        -1.0       0.72      0.66      0.69     10691
         0.0       0.83      0.89      0.86     16644
         1.0       0.84      0.82      0.83     21556

    accuracy                           0.81     48891
   macro avg       0.79      0.79      0.79     48891
weighted avg       0.81      0.81      0.81     48891

