<a href="https://colab.research.google.com/github/till-tomorrow/sentiment-analysis-of-tweets/blob/main/TMLC_assignment_for_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [39]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

dataset = pd.read_csv('Twitter_Data.csv')

dataset.shape

(162980, 2)

In [40]:
# considering only 10 percent data just to get the idea of the model performance. This could be wrong as the data could be biased but I am taking it considering the time and machine constraints
dataset=dataset.head(16300)

In [41]:
# displaying top rows to get the feel of data
dataset.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [42]:
# analyzing the categories and the number of entries per category
dataset['category'].value_counts()

 1.0    6756
 0.0    5641
-1.0    3903
Name: category, dtype: int64

In [43]:
dataset.describe()

Unnamed: 0,category
count,16300.0
mean,0.175031
std,0.789512
min,-1.0
25%,0.0
50%,0.0
75%,1.0
max,1.0


In [44]:
# printing rows containing NaN
is_NaN = dataset.isnull()
row_has_NaN = is_NaN.any(axis=1)
rows_with_NaN = dataset[row_has_NaN]

print(rows_with_NaN)

    clean_text  category
148        NaN       0.0


In [45]:
# since the above row 148 doesn't contain the tweet, it is better to drop the row
# if there were a tweet, then insted of dropping the row, I would have manually classified them
dataset=dataset.dropna()

In [46]:
dataset = dataset.reset_index()
dataset.shape # just to be sure that the dropna() worked properly

(16299, 3)

In [47]:
# since there are 3 columns now so displaying what is the new column
dataset.columns

Index(['index', 'clean_text', 'category'], dtype='object')

In [50]:
dataset =dataset.drop(['index'], axis = 1)

In [51]:
dataset.shape # finally, back to normal!!

(16299, 2)

In [52]:
# tokenise
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
corpus = []

num_data_points = 16299
for i in range(0, num_data_points):
  a_tweet = re.sub("[^a-zA-Z]", " ", dataset['clean_text'][i])
  a_tweet = a_tweet.lower()
  a_tweet = a_tweet.split()
  ps = PorterStemmer()
  all_stopwords = stopwords.words('english')
  all_stopwords.remove('not')
  a_tweet = [ps.stem(word) for word in a_tweet if not word in set(all_stopwords)]
  a_tweet = ' '.join(a_tweet)
  corpus.append(a_tweet)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [53]:
# bag of words
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(corpus).toarray()
y = dataset.iloc[:, 1].values
print(X.shape)

(16299, 19049)


In [54]:
# considering only about 10 percent of the total words from the corpus
cv = CountVectorizer(max_features = 1905)
X = cv.fit_transform(corpus).toarray()

In [55]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20)

In [56]:
# classification using naive bayes algo
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [57]:
y_pred = classifier.predict(X_test)

In [58]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.5665644171779141

In [59]:
# classification using knn
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors = 7)
classifier.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=7, p=2,
                     weights='uniform')

In [60]:
y_pred = classifier.predict(X_test)

In [61]:
accuracy_score(y_test, y_pred)

0.41595092024539876

In [62]:
# classification using randon forest
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 15, criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='entropy', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=15,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [63]:
y_pred = classifier.predict(X_test)

In [64]:
accuracy_score(y_test, y_pred)

0.7656441717791411

In [65]:
# classification using decision tree
from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='entropy',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')

In [66]:
y_pred = classifier.predict(X_test)

In [67]:
accuracy_score(y_test, y_pred)

0.7263803680981595