In [1]:
import pandas as pd
import numpy as np

In [3]:
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test_no_answer.csv')

In [4]:
df_train

Unnamed: 0,row_id,TEXT,LABEL
0,0,good to know if you can t find these elsewhere .,1
1,1,love it ! the grill plates come out and pop i...,1
2,2,i m convinced this was a poorly executed refur...,0
3,3,i would never have complained about that if it...,0
4,4,"the photo shows the same whole , large candie...",0
...,...,...,...
10995,10995,i didn t quite get it the first time .,1
10996,10996,i ve tried installing with and without the oem...,0
10997,10997,i was parked at a truck stop in the cincinnati...,0
10998,10998,i recently bought this case after seeing some ...,1


In [6]:
# there is no .isupper()
uppercase_count = 0
for index, row in df_train.iterrows():
    if row['TEXT'].isupper():
        uppercase_count += 1
uppercase_count

0

### Exploring: frequency distributions
https://realpython.com/python-nltk-sentiment-analysis/#using-nltks-pre-trained-sentiment-analyzer

In [9]:
import nltk
from pprint import pprint

In [14]:
df_train.loc[0]

row_id                                                   0
TEXT      good to know if you can t find these elsewhere .
LABEL                                                    1
Name: 0, dtype: object

In [16]:
text = df_train.loc[0]['TEXT']
pprint(nltk.word_tokenize(text), compact=True)

['good', 'to', 'know', 'if', 'you', 'can', 't', 'find', 'these', 'elsewhere',
 '.']


In [19]:
# tokenize all sentence
# there is no .isupper()
full_text = ''
for index, row in df_train.iterrows():
    full_text += row['TEXT']

In [23]:
# find most common words
words = nltk.word_tokenize(full_text)
fd = nltk.FreqDist(words)

In [24]:
fd.most_common(3)

[('the', 5925), (',', 4510), ('and', 3967)]

In [26]:
fd.tabulate(20)

 the    ,  and   it    a   to    i   .i   is  for   of this   in   my that  not with    t have   on 
5925 4510 3967 3621 3487 3364 2889 2858 2282 2019 1962 1944 1453 1388 1259 1171 1159 1140 1136 1111 


### Extracting Concordance and Collocations

In [28]:
fd.concordance("purchased", lines=5)

AttributeError: 'FreqDist' object has no attribute 'concordance'

### Using NLTK’s Pre-Trained Sentiment Analyzer
NLTK already has a built-in, pretrained sentiment analyzer called VADER (Valence Aware Dictionary and sEntiment Reasoner).

In [30]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Weber\AppData\Roaming\nltk_data...


True

In [31]:
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()
sia.polarity_scores("Wow, NLTK is really powerful!")

{'neg': 0.0, 'neu': 0.295, 'pos': 0.705, 'compound': 0.8012}

In [39]:
predict = []

In [32]:
def is_positive(tweet: str) -> bool:
    """True if tweet has positive compound sentiment, False otherwise."""
    return sia.polarity_scores(tweet)["compound"] > 0

In [40]:
for index, row in df_train.iterrows():
    if is_positive(row['TEXT']):
        predict.append(1)
    else:
        predict.append(0)

In [47]:
pd.Series(predict).value_counts()

0    5916
1    5084
dtype: int64

In [48]:
# confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(df_train['LABEL'], predict)

array([[3435, 2065],
       [2481, 3019]], dtype=int64)

[ tn, fp ]  
[ fn, tp ]

In [49]:
tn, fp, fn, tp = confusion_matrix(df_train['LABEL'], predict).ravel()
(tn, fp, fn, tp)

(3435, 2065, 2481, 3019)

In [50]:
print(F"{(tn+tp) / len(predict):.2%} correct")

58.67% correct


In [51]:
print(F"{tp / (tp + fn):.2%} recall") #TP/(TP+FN)

54.89% recall


In [None]:
# test set