In [1]:
import pandas as pd
import numpy as np
import nltk

In [2]:
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test_no_answer.csv')

### Using NLTK’s Pre-Trained Sentiment Analyzer
NLTK already has a built-in, pretrained sentiment analyzer called VADER (Valence Aware Dictionary and sEntiment Reasoner).

In [3]:
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Weber\AppData\Roaming\nltk_data...


True

try some analysis

In [4]:
from nltk.sentiment import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()
sia.polarity_scores("Wow, NLTK is really powerful!")

{'neg': 0.0, 'neu': 0.295, 'pos': 0.705, 'compound': 0.8012}

In [5]:
def is_positive(tweet: str) -> bool:
    """True if tweet has positive compound sentiment, False otherwise."""
    return sia.polarity_scores(tweet)["compound"] > 0

In [6]:
predict = []
for index, row in df_train.iterrows():
    if is_positive(row['TEXT']):
        predict.append(1)
    else:
        predict.append(0)

### View result on training set

In [7]:
pd.Series(predict).value_counts()

0    5916
1    5084
dtype: int64

In [8]:
# confusion matrix
from sklearn.metrics import confusion_matrix
confusion_matrix(df_train['LABEL'], predict)

array([[3435, 2065],
       [2481, 3019]], dtype=int64)

In [9]:
tn, fp, fn, tp = confusion_matrix(df_train['LABEL'], predict).ravel()
(tn, fp, fn, tp)

(3435, 2065, 2481, 3019)

In [10]:
print(F"{(tn+tp) / len(predict):.2%} correct")
print(F"{tp / (tp + fn):.2%} recall") #TP/(TP+FN)

58.67% correct
54.89% recall


### Predict on Test set

In [11]:
test_prediction = []
for index, row in df_test.iterrows():
    if is_positive(row['TEXT']):
        test_prediction.append(1)
    else:
        test_prediction.append(0)

In [12]:
pd.Series(predict).value_counts()

0    5916
1    5084
dtype: int64

Create submission file (execute  carefully!)

In [15]:
submission = {
    'row_id': list(range(0,len(test_prediction))),
    'LABEL': test_prediction
}
submission = pd.DataFrame.from_dict(submission)
submission.head(3)

Unnamed: 0,row_id,LABEL
0,0,0
1,1,1
2,2,1


In [16]:
submission.to_csv('submission/sub_NLTK.csv', index=False)