In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# 1. Exploratory Data Analysis

In [3]:
#read in csv file data
#read train rows from 
trainCSV = pd.read_csv('train.csv', names = ["Index", "Sentiment", "Text"], skiprows=770000, nrows = 50000)
#using a subset of data from the train.csv file, a total of 50,0000 roughly 5% of the data as using the entire dataset led to a 
#long time for data processing
#may lead to model skewing to using a subset of the data, tried to roughly do a 50/50 or 60/40 of negative/positive sentiments
#to ensure that the data could have roughly the same amount of data for both sentiments to build the model
testCSV = pd.read_csv('test.csv')

rows = len(trainCSV)
rows2 = len(testCSV)

#check proper number of rows from the csv files
print("# of rows from trainCSV " + str(rows))
print("# of rows from testCSV " + str(rows2))

# of rows from trainCSV 50000
# of rows from testCSV 359


In [4]:
#check associated # of sentiment values from train csv
numNeg = ((trainCSV['Sentiment'] == 0).sum())
print("# of negatives from trainCSV " + str(numNeg))

numPos = ((trainCSV['Sentiment'] == 1).sum())
print("# of positives from trainCSV " + str(numPos))

#check if there are any improper sentiment values in traincsv
numNA = ((trainCSV['Sentiment'] >= 2).sum())
print("# of neither positive or negative from trainCSV " + str(numNA))

# of negatives from trainCSV 30001
# of positives from trainCSV 19999
# of neither positive or negative from trainCSV 0


In [5]:
#check if there are null values in the csv files
print("Null values in trainCSV")
print(trainCSV.isnull().sum())

print("Null values in testCSV")
print(testCSV.isnull().sum())

Null values in trainCSV
Index        0
Sentiment    0
Text         0
dtype: int64
Null values in testCSV
Index        0
Sentiment    0
Text         0
dtype: int64


# 2: Text Preprocessing.

In [6]:
#change all text to lowercase
trainCSV['Text'] = trainCSV['Text'].str.lower();
testCSV['Text'] = trainCSV['Text'].str.lower();

#check to make sure operation worked and changed to lowercase
print(trainCSV)

        Index  Sentiment                                               Text
0      769999          0  @breelin i need to make it silly lol. i miss you 
1      770000          0  away upstairs to bed. gonna miss you twitter. ...
2      770001          0   movie theater here isnt doing midnight showin...
3      770002          0  home and bored. feel like chattin to people on...
4      770003          0         @traviswfreeman why are you leaving us??? 
...       ...        ...                                                ...
49995  819994          1                        @primaryposition ok thanks 
49996  819995          1  we just met some awesome people at t.g.i fridays 
49997  819996          1  oh my goodness . my feet are the most sensitiv...
49998  819997          1                                  @hey_angy hahaha 
49999  819998          1  @lamartian30 pfft school is koo. i'm ranked 2n...

[50000 rows x 3 columns]


In [7]:
#check to make sure operation worked and changed to lowercase
print(testCSV)

     Index  Sentiment                                               Text
0        0          1  @breelin i need to make it silly lol. i miss you 
1        1          1  away upstairs to bed. gonna miss you twitter. ...
2        2          1   movie theater here isnt doing midnight showin...
3        3          1  home and bored. feel like chattin to people on...
4        4          1         @traviswfreeman why are you leaving us??? 
..     ...        ...                                                ...
354    492          1  @reallamarodom i miss the lakers already  don'...
355    494          0  @kandydevil  why do they have to be so expensi...
356    495          1  @firstpersonarts is slam tonight? i always los...
357    496          0  i'm so boreeeeed. . . . . . . . . . . . . . . ...
358    497          0  about to starve but i just don't know what i w...

[359 rows x 3 columns]


In [8]:
#remove special characters
trainCSV['Text'] = trainCSV['Text'].str.replace('[^a-zA-Z0-9 \n\.]', '')
testCSV['Text'] = testCSV['Text'].str.replace('[^a-zA-Z0-9 \n\.]', '')

#remove numerical numbers
trainCSV['Text'] = trainCSV['Text'].str.replace('\d', '')
testCSV['Text'] = testCSV['Text'].str.replace('\d', '')

#print for visual check that special characters and digits removed
print(trainCSV)

        Index  Sentiment                                               Text
0      769999          0   breelin i need to make it silly lol. i miss you 
1      770000          0  away upstairs to bed. gonna miss you twitter. ...
2      770001          0   movie theater here isnt doing midnight showin...
3      770002          0  home and bored. feel like chattin to people on...
4      770003          0             traviswfreeman why are you leaving us 
...       ...        ...                                                ...
49995  819994          1                         primaryposition ok thanks 
49996  819995          1  we just met some awesome people at t.g.i fridays 
49997  819996          1  oh my goodness . my feet are the most sensitiv...
49998  819997          1                                    heyangy hahaha 
49999  819998          1  lamartian pfft school is koo. im ranked nd in ...

[50000 rows x 3 columns]


  trainCSV['Text'] = trainCSV['Text'].str.replace('[^a-zA-Z0-9 \n\.]', '')
  testCSV['Text'] = testCSV['Text'].str.replace('[^a-zA-Z0-9 \n\.]', '')
  trainCSV['Text'] = trainCSV['Text'].str.replace('\d', '')
  testCSV['Text'] = testCSV['Text'].str.replace('\d', '')


In [9]:
#print for visual check that special characters and digits removed
print(testCSV)

     Index  Sentiment                                               Text
0        0          1   breelin i need to make it silly lol. i miss you 
1        1          1  away upstairs to bed. gonna miss you twitter. ...
2        2          1   movie theater here isnt doing midnight showin...
3        3          1  home and bored. feel like chattin to people on...
4        4          1             traviswfreeman why are you leaving us 
..     ...        ...                                                ...
354    492          1  reallamarodom i miss the lakers already  dont ...
355    494          0  kandydevil  why do they have to be so expensiv...
356    495          1  firstpersonarts is slam tonight i always lose ...
357    496          0  im so boreeeeed. . . . . . . . . . . . . . . ....
358    497          0  about to starve but i just dont know what i wa...

[359 rows x 3 columns]


# 3: Linguistic Feature Extraction.

In [12]:
#NLP using bag of words, token each word
tokens = CountVectorizer()

trainWords = tokens.fit_transform(trainCSV['Text'])
testWords = tokens.transform(testCSV['Text'])

# 4: Build your sentiment classification model.

In [30]:
#building a logistic regression model
model = LogisticRegression(max_iter = 50000)
trainY = trainCSV['Sentiment'].values
testY = testCSV['Sentiment'].values

model.fit(trainWords,trainY);

test_predicted = model.predict(testWords)
test_prebability = model.predict_proba(testWords)

# 5: Model evaluation.

In [31]:
print(classification_report(test_predicted, testY))

              precision    recall  f1-score   support

           0       0.94      0.49      0.65       336
           1       0.07      0.52      0.12        23

    accuracy                           0.50       359
   macro avg       0.50      0.51      0.38       359
weighted avg       0.88      0.50      0.61       359



In [None]:
#model evaluation has high precision for predicting negative sentiment from 
#the test csv, but has low precision for predicting positive sentiment from
#the test csv
#this may be caused using a subset of the train csv leading to data skewing
#as I used 30,000 negative sentiments and 20,000 positive sentiments giving
#the model more sentiment data to analyze for negative sentiments compared
#to positive sentiments
#would be interesting to run the entirety of train csv data at a later time
#to give it the extra time it needs to process the entire dataset and compare
#results