# NLTK Pre-trained model
Analyzed using NLTK Sentiment Intensity Analyzer.
* Preproccessed by removed duplicated.

## Import Packages and Enviroment Variable

In [1]:
import pandas as pd

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import os

In [7]:
nltk.download('all') #first time use

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /home/allen/nltk_data...
[nltk_data]    |   Unzipping corpora/abc.zip.
[nltk_data]    | Downloading package alpino to
[nltk_data]    |     /home/allen/nltk_data...
[nltk_data]    |   Unzipping corpora/alpino.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /home/allen/nltk_data...
[nltk_data]    |   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_eng to
[nltk_data]    |     /home/allen/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_eng.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /home/allen/nltk_data...
[nltk_data]    |   Unzipping
[nltk_data]    |       taggers/averaged_perceptron_tagger_ru.zip.
[nltk_data]    | Downloading package averaged_perceptron_tagger_rus to
[nltk_d

True

In [8]:
DATA_PATH = "data/"
DIR_PATH = os.getcwd()
TRAIN_DATA_PATH = os.path.join(DIR_PATH, DATA_PATH, "train.csv")
TEST_DATA_PATH = os.path.join(DIR_PATH, DATA_PATH, "test.csv")

## Load Data and Preprocessed

In [None]:
train_data = pd.read_csv(TRAIN_DATA_PATH)
test_data = pd.read_csv(TEST_DATA_PATH)

In [12]:
#Remove Duplicate
train_data = train_data.drop_duplicates()

In [13]:
# create preprocess_text function
def preprocess_text(text):
    '''
    Preprocess the input text by tokenizing, removing stop words, and lemmatizing.
    '''
    # Tokenize the text
    tokens = word_tokenize(text.lower())

    # Remove stop words
    filtered_tokens = [token for token in tokens if token not in stopwords.words('english')]

    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    # Join the tokens back into a string
    processed_text = ' '.join(lemmatized_tokens)
    return processed_text

In [36]:
train_data['numeric_sentiment'] = train_data['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

In [14]:
train_data['review'] = train_data['review'].apply(preprocess_text)
train_data.head()

Unnamed: 0,review,sentiment
0,single worst film 've ever seen theater . saw ...,negative
1,actually around 13 year old camping near mcclo...,positive
2,small town attacked horde bloodthirsty vampire...,negative
3,think problem show getting respect truly deser...,positive
4,"wow , movie horrible . bill fan really looking...",negative


## NLTK Sentiment Analysis

In [18]:
analyzer = SentimentIntensityAnalyzer()

In [19]:
analyzer.polarity_scores("This is a great movie!")

{'neg': 0.0, 'neu': 0.406, 'pos': 0.594, 'compound': 0.6588}

In [30]:
def analyze_sentiment(text):
    '''
    Analyze the sentiment of the input text using VADER sentiment analysis.
    Returns 'positive', 'negative', or 'neutral' based on the compound score.
    '''
    scores = analyzer.polarity_scores(text)
    sentiment = 1 if scores['pos'] > 0.2 else 0
    return sentiment



In [31]:
train_data['predicted_sentiment'] = train_data['review'].apply(analyze_sentiment)

In [32]:
train_data['predicted_sentiment'].value_counts()

predicted_sentiment
1    21486
0    18182
Name: count, dtype: int64

## Evaluation

In [39]:
from sklearn.metrics import confusion_matrix, classification_report

In [40]:
confusion_matrix(train_data['numeric_sentiment'], train_data['predicted_sentiment'])

array([[12470,  7305],
       [ 5712, 14181]])

In [42]:
print(classification_report(train_data['numeric_sentiment'], train_data['predicted_sentiment'])   )

              precision    recall  f1-score   support

           0       0.69      0.63      0.66     19775
           1       0.66      0.71      0.69     19893

    accuracy                           0.67     39668
   macro avg       0.67      0.67      0.67     39668
weighted avg       0.67      0.67      0.67     39668

