# Data Science Final Project


For my final project, I chose to utilize Natural Language Processing (NLP) for sentiment analysis.

In [1]:
# install packages (for use on a differect computer)
# !pip install --user --upgrade nltk
# !pip install --user --upgrade keras
# !pip install --user --upgrade tensorflow
# !pip install --user --upgrade textblob
# !pip install --user --upgrade tqdm
# !pip install --user --upgrade time

# for progress bars
from tqdm import tqdm
import time

from matplotlib import pyplot
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

from datetime import datetime
import time
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import gc
from dateutil import parser
import string
import keras
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob
from sklearn.linear_model import SGDClassifier
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import SpatialDropout1D
from keras.layers import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.callbacks import EarlyStopping
from keras.utils import pad_sequences

plt.rcParams.update(plt.rcParamsDefault)
%matplotlib inline
pd.set_option('display.max_columns', None)
pd.options.display.max_colwidth = 1000
tqdm.pandas()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tdepa\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Here are some of the references that I used (for downloading the data and with code help)

https://www.kaggle.com/datasets/kazanova/sentiment140

https://www.analyticsvidhya.com/blog/2021/06/twitter-sentiment-analysis-a-nlp-use-case-for-beginners/

https://www.kaggle.com/datasets/columbine/imdb-dataset-sentiment-analysis-in-csv-format

https://www.kaggle.com/datasets/gpreda/covid19-tweets

## Importing Dataset

Here, I import the dataset containing the pre-analyzed tweets. I fix the dataframe a little to work better with the machine learning model. I also shuffle the dataset, as it is originally sorted with all positives first and then negative tweets.

In [2]:
df = pd.read_csv('twitterdata.csv', encoding = "ISO-8859-1")
print('Loaded file.')
df = pd.DataFrame(np.vstack([df.columns, df])) # Moves column names into row 1
df.columns = ['sentiment', 'id', 'date', 'query', 'user', 'tweet'] # Renames columns
df.replace({'sentiment': {4: 1}}, inplace=True) # Replaces all '4's with '1's in column 'sentiment'
df.replace({'sentiment': {0: -1}}, inplace=True) # Replaces all '0's with '-1's in column 'sentiment'
data_types_dict = {
    'sentiment': int,
    'id': float
}
df = df.astype(data_types_dict) # changes value types
df.dtypes

Loaded file.


sentiment      int32
id           float64
date          object
query         object
user          object
tweet         object
dtype: object

In [3]:
df = df.sample(frac=1).reset_index(drop=True) # shuffles rows so its not all 1 and then -1
df = df[['sentiment', 'tweet']]
df.head()

Unnamed: 0,sentiment,tweet
0,-1,Just in newry...me and cat are going to buy a tent! Summer 2008 people! Its raining now though.
1,1,"@zey_rochelle The one we love/like may not always be the right one for us.Time heals wonds,pain,and sorrow"
2,1,Finally got rid of the Smart Board tools automatic launch on login on my Mac Leopard.
3,-1,Penultimate Pushing Daisies
4,-1,@jeramyer haha nope aint gonna work as we hav a bday list up so they no it aint my bday


In [4]:
s = df.stack().value_counts()
print(s)

1                                                                                                                                800000
-1                                                                                                                               799999
isPlayer Has Died! Sorry                                                                                                            210
good morning                                                                                                                        118
headache                                                                                                                            115
                                                                                                                                  ...  
mmm 2 hrs left at work then bed think me is coming down with a bug                                                                    1
@SelanneGirl Saku Koivu in Anaheim    YES!!!   O

Here, since the dataset is very large, I can limit the amount of rows used to run faster while testing as `cutoff`

In [5]:
cutoff = 5000
content = df['tweet'][:cutoff].values.tolist()
labels = df['sentiment'][:cutoff].values.tolist()
content = [x.strip() for x in content] # Deletes white space before and after
content[:20]

['Just in newry...me and cat are going to buy a tent! Summer 2008 people! Its raining now though.',
 '@zey_rochelle The one we love/like may not always be the right one for us.Time heals wonds,pain,and sorrow',
 'Finally got rid of the Smart Board tools automatic launch on login on my Mac Leopard.',
 'Penultimate Pushing Daisies',
 '@jeramyer haha nope aint gonna work as we hav a bday list up so they no it aint my bday',
 'Missing out on Manhattanhenge',
 "@ddlovato OMG DEMI YOU ARE AWESOME! YOU ROCK!I CAN'T WAIT FOR YOUR NEW CD.. THE COVER SEEMS SO COOL  I LOVE U SO MUCH!",
 '@PawPrintsMag it appears the shorter ones and being used already   Any other suggestions for possible 2 or 3 letter tags?',
 'Come check out JavaOne DAY 3 on twazzup! http://javaone.twazzup.com If you want your own event page, DM @twazzup',
 'Not a cloud in the sky!! So beautiful out!!! To bad its wasted on work!',
 'Updating my bookclub site. http://bookclub.meetup.com/1316 Replying to emails. About to list a po

We create `y` as a numpy array of all labels.

In [6]:
y = np.array(labels, dtype='int8')
y[:20]

array([-1,  1,  1, -1, -1, -1,  1, -1,  1, -1,  1,  1, -1, -1,  1,  1,  1,
        1,  1,  1], dtype=int8)

## Stop Words #1

We create a dataframe `content1` that is processed through nltk's list of stop words. Basically, we go through each tweet and remove each of the 'stop words' that do not add any meaning to make it easier for the model to train on. It also removes other unneccesary information, such as numbers and punctuation.

In [7]:
def full_remove(x, removal_list):
    # function for removing the stop words
    for w in removal_list:
        x = x.replace(w, ' ')
    return x

# remove all digits
digits = [str(x) for x in range(10)]
remove_digits = [full_remove(x, digits) for x in content]

# remove all punctuation
remove_punc = [full_remove(x, list(string.punctuation)) for x in remove_digits]

# make everything lower-case and remove any white space
sents_lower = [x.lower() for x in remove_punc]
sents_lower = [x.strip() for x in sents_lower]

# remove stop words
def removeStopWords(stopWords, txt):
    newtxt = ' '.join([word for word in txt.split() if word not in stopWords])
    return newtxt
content1 = [removeStopWords(stops,x) for x in sents_lower]
content1[:20]

NameError: name 'stops' is not defined

## Vectorizing

explain here

In [8]:
vectorizer = CountVectorizer(analyzer = "word", 
                             preprocessor = None, 
                             stop_words =  'english', 
                             max_features = 6000, ngram_range=(1,5))
data_features = vectorizer.fit_transform(content2)
tfidf_transformer = TfidfTransformer()
data_features_tfidf = tfidf_transformer.fit_transform(data_features)
data_mat = data_features_tfidf.toarray()

np.random.seed(0)
test_index = np.append(np.random.choice((np.where(y==-1))[0], 250, replace=False), np.random.choice((np.where(y==1))[0], 250, replace=False))
train_index = list(set(range(len(labels))) - set(test_index))
train_data = data_mat[train_index,]
train_labels = y[train_index]
test_data = data_mat[test_index,]
test_labels = y[test_index]

# create polarity function and subjectivity function
pol = lambda x: TextBlob(x).sentiment.polarity
sub = lambda x: TextBlob(x).sentiment.subjectivity
pol_list = [pol(x) for x in content2]
sub_list = [sub(x) for x in content2]

NameError: name 'content2' is not defined

In [None]:
for i in range(10):
    print(content2[i], '\t', pol_list[i], sub_list[i])

In [None]:
## Fit logistic classifier on training data
clf = SGDClassifier(loss="log", penalty="none")
clf.fit(train_data, train_labels)
## Pull out the parameters (w,b) of the logistic regression model
w = clf.coef_[0,:]
b = clf.intercept_
## Get predictions on training and test data
preds_train = clf.predict(train_data)
preds_test = clf.predict(test_data)
## Compute errors
errs_train = np.sum((preds_train > 0.0) != (train_labels > 0.0))
errs_test = np.sum((preds_test > 0.0) != (test_labels > 0.0))
print("Training error: ", float(errs_train)/len(train_labels))
print("Test error: ", float(errs_test)/len(test_labels))

In [None]:
## Convert vocabulary into a list:
vocab = np.array([z[0] for z in sorted(vectorizer.vocabulary_.items(), key=lambda x:x[1])])
## Get indices of sorting w
inds = np.argsort(w)
## Words with large negative values
neg_inds = inds[0:50]
print("Highly negative words: ")
# MB: fixed bug here
print([x for x in list(vocab[neg_inds])])
## Words with large positive values
pos_inds = inds[-49:-1]
print("Highly positive words: ")
print([x for x in list(vocab[pos_inds])])

In [None]:
print(clf.predict(vectorizer.transform(["It's a sad movie but very good"])))
print(clf.predict(vectorizer.transform(["Waste of my time"])))
print(clf.predict(vectorizer.transform(["It is not what like"])))
print(clf.predict(vectorizer.transform(["It is not what I m looking for"])))

In [None]:
from sklearn.linear_model import SGDClassifier
svm_clf = SGDClassifier(loss="hinge", penalty='l2')
svm_clf.fit(train_data, train_labels)
svm_preds_test = svm_clf.predict(test_data)
svm_errs_test = np.sum((svm_preds_test > 0.0) != (test_labels > 0.0))
print("Test error: ", float(svm_errs_test)/len(test_labels))

In [None]:
print(svm_clf.predict(vectorizer.transform(["It's a sad movie but very good"])))
print(svm_clf.predict(vectorizer.transform(["Waste of my time"])))
print(svm_clf.predict(vectorizer.transform(["This is not what I like"])))
print(svm_clf.predict(vectorizer.transform(["It is not what I am looking for"])))

In [None]:
max_review_length = 200
tokenizer = Tokenizer(num_words=10000,  #max no. of unique words to keep
                      filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', 
                      lower=True #convert to lower case
                     )
tokenizer.fit_on_texts(content2)

X = tokenizer.texts_to_sequences(content2)
X = pad_sequences(X, maxlen= max_review_length)
print('Shape of data tensor:', X.shape)

Y=pd.get_dummies(y).values

np.random.seed(0)
test_inds = np.append(np.random.choice((np.where(y==-1))[0], 250, replace=False), np.random.choice((np.where(y==1))[0], 250, replace=False))
train_inds = list(set(range(len(labels))) - set(test_inds))
train_data = X[train_inds,]
train_labels = Y[train_inds]
test_data = X[test_inds,]
test_labels = Y[test_inds]

In [None]:
EMBEDDING_DIM = 200
model = Sequential()
model.add(Embedding(10000, EMBEDDING_DIM, input_length=X.shape[1]))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(250, dropout=0.2,return_sequences=True))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
print(model.summary())

In [None]:
epochs = 2
batch_size = 40
model.fit(train_data, train_labels, 
          epochs=epochs, 
          batch_size=batch_size,
          validation_split=0.1)

In [None]:
loss, acc = model.evaluate(test_data, test_labels, verbose=2,
                            batch_size=batch_size)
print(f"Loss: {loss}")
print(f"Validation accuracy: {acc}")

In [None]:
outcome_labels = ['Negative', 'Positive']
new = ["test"]
 
def predict_sentiment(text, pr):
    seq = tokenizer.texts_to_sequences(text)
    padded = pad_sequences(seq, maxlen=max_review_length)
    if pr:
        pred = model.predict(padded)
        print("Probability distribution: ", pred)
        print(f"Is this a Positive or Negative message? '{text[0]}'")
        print(outcome_labels[np.argmax(pred)])
    else:
        pred = model.predict(padded, verbose=0)
    return outcome_labels[np.argmax(pred)]

predict_sentiment(new, True)

In [None]:
print(predict_sentiment(['kys'], True))

In [None]:
dfcovid = pd.read_csv('covid19_tweets.csv')
dfcovid = dfcovid[['text']]
dfcovid['text'] = dfcovid['text'].str.replace(r'https?://\S+', '', case=False)
dfcovid = dfcovid[:5000]
dfcovid

In [None]:
dfcovid['result'] = dfcovid.progress_apply(lambda row: predict_sentiment([row.text], False), axis=1)
dfcovid.head()

In [None]:
# Use seaborn to create a bar graph
sns.countplot(x='result', data=dfcovid, palette='Blues')

# Add labels and show the plot
plt.xlabel('Result')
plt.ylabel('Count')
plt.title('Count of -1 vs 1')
plt.show()