In [1]:
import itertools
import logging
import os
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

from sentiment_analysis.config import SentimentAnalysisConfig
from sentiment_analysis.data_access import DataClass
from sentiment_analysis.features import Features
from sentiment_analysis.utils.constants import (
    TEXT,
    TARGET,
    ORIGINAL_TEXT,
    SPLIT,
    DEVELOP,
    TRAIN,
    VALID,
    TEST
)

# Plot styling: seaborn dark grid, "muted" colour codes, serif fonts.
sns.set_style("darkgrid")
sns.set_color_codes("muted")
matplotlib.rcParams.update({"font.family": "serif"})

# Parent of the current working directory; handed to the config in the next cell.
PARENT_PATH = Path.cwd().parent.absolute()

# Root logger: INFO level with timestamp, logger name, level and message.
FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
logging.basicConfig(level=logging.INFO, format=FORMAT)

In [2]:
# Project configuration, pointed at the directory computed in the setup cell.
config = SentimentAnalysisConfig()
config.CURRENT_PATH = PARENT_PATH

# Build the raw review frame through the project's data-access class.
data = DataClass(config)
df = data.build()

# Preview the first ten rows.
df.head(n=10)

2022-12-04 06:24:31,735 - sentiment_analysis.utils.utils - INFO - func:build took: 1.40 sec


Unnamed: 0,review,sentiment,Split
0,how can a director that makes such great films...,neg,development
1,This is one of the worst film adaptations of a...,neg,development
2,This is an art film that was either made in 19...,neg,development
3,"For two of the funniest comedians, the movie w...",neg,development
4,Doris Day never lets a bad script get her down...,neg,development
5,Unwatchable. You can't even make it past the f...,neg,development
6,"Quite average even by Monogram standards, this...",neg,development
7,This is not a good movie. Too preachy in parts...,neg,development
8,I am really shocked that a great director like...,neg,development
9,This is one of the weakest soft porn film arou...,neg,development


In [3]:
# Feature helper from sentiment_analysis.features; the cells below call its
# clean, fit_transform and transform methods.
features = Features()

In [4]:
# Work on a seeded random subset of 10,000 rows; the copy detaches the sample
# from the full frame before new columns are added to it.
df = df.sample(n=10000, random_state=62).copy()

In [5]:
# Keep a copy of the text column under ORIGINAL_TEXT before it is passed to
# features.clean in a later cell.
df[ORIGINAL_TEXT] = df[TEXT]

In [6]:
# Re-encode the target column through config.SENTIMENT_MAP.
# NOTE(review): presumably a dict from the raw labels ('neg', ...) to 0/1 —
# confirm. If so, this cell is not safe to re-run: Series.map turns values
# that are missing from a dict into NaN.
df[TARGET] = df[TARGET].map(config.SENTIMENT_MAP)

In [7]:
# Run the project's cleaning step on the sampled frame and rebind df to the
# result (the log line below reports how long the call took).
df = features.clean(df)

2022-12-04 06:19:54,088 - sentiment_analysis.utils.utils - INFO - func:clean took: 17.10 sec


In [8]:
# Separate the development rows from the held-out test rows using the SPLIT
# column. DEVELOP is imported from sentiment_analysis.utils.constants together
# with the other split labels; without that import this cell raises NameError
# on a fresh kernel.
dev = df.loc[df[SPLIT] == DEVELOP].copy()
test = df.loc[df[SPLIT] == TEST].copy()

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
# Hold out 10% of the development rows for validation (seeded shuffle).
# Only the frames are needed downstream, so the target column is not split
# separately: the row partition is the same with or without it, and this
# avoids rebinding `_`, which IPython uses for the last cell output.
train, valid = train_test_split(
    dev, test_size=0.1, shuffle=True, random_state=64
)

In [17]:
# Relabel the SPLIT column of each partition. DataFrame.assign returns a new
# frame, so nothing is written into the slices produced by train_test_split
# (which can trigger pandas' SettingWithCopyWarning). The existing column is
# overwritten in place, so column order is unchanged.
train = train.assign(**{SPLIT: TRAIN})
valid = valid.assign(**{SPLIT: VALID})

In [23]:
# Fit the feature step on the training partition only and transform it in the
# same call (the log below shows fit followed by transform).
train_tf = features.fit_transform(train)

2022-12-04 06:22:48,926 - sentiment_analysis.utils.utils - INFO - func:fit took: 2.26 sec
2022-12-04 06:22:49,532 - sentiment_analysis.utils.utils - INFO - func:transform took: 0.60 sec
2022-12-04 06:22:49,533 - sentiment_analysis.utils.utils - INFO - func:fit_transform took: 2.86 sec


In [24]:
# Apply the already-fitted feature step to the validation rows, then to the
# test rows; neither call refits.
valid_tf = features.transform(valid)
test_tf = features.transform(test)

2022-12-04 06:23:17,088 - sentiment_analysis.utils.utils - INFO - func:transform took: 0.10 sec
2022-12-04 06:23:17,998 - sentiment_analysis.utils.utils - INFO - func:transform took: 0.91 sec


In [31]:
# Display the transformed validation frame: the original columns followed by
# the generated word__* feature columns.
valid_tf

Unnamed: 0,review,sentiment,Split,Original Text,word__10 10,word__10 minute,word__10 year,word__12 year,word__14 year,word__15 minute,...,word__youll see,word__young boy,word__young girl,word__young man,word__young men,word__young woman,word__youre going,word__youre looking,word__youve got,word__youve seen
0,cool j performed much better movie expected fa...,0,validation,LL Cool J performed much better in this movie ...,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,watched movie really late last night usually l...,0,validation,I watched this movie really late last night an...,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,must admit eros movie collection one love well...,1,validation,"I must admit, out of the EROS MOVIE COLLECTION...",0.284412,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,entertained throughout insightful documentary ...,1,validation,I was so entertained throughout this insightfu...,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,watched video friend house im glad waste money...,0,validation,I watched this video at a friend's house. I'm ...,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
489,one person say movie beautiful delicate explor...,1,validation,"So one person says, ""This movie is a beautiful...",0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
490,moment film dreadful teeth ache knowing week l...,1,validation,There are moments in the film that are so drea...,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
491,try funny fails miserably animation terrible l...,0,validation,tries to be funny and fails miserably. The ani...,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
492,movie epitomizes dd fear 80 even today fear pe...,0,validation,This is the movie that epitomizes the D&D fear...,0.000000,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
        # NOTE(review): this cell has no execution count and refers to `self`,
        # which no visible cell defines, so it raises NameError if run as-is.
        # It looks like a fragment pasted from a method body that repeats the
        # clean + dev/test split steps done earlier with `features` — confirm
        # and either delete it or rewrite it against `features`.
        df = self.clean(df)
        dev = df[df[SPLIT].isin([DEVELOP])].copy()
        test = df[df[SPLIT].isin([TEST])].copy()

In [3]:
# One-call version of the pipeline: a fresh Features helper, the same seeded
# 10,000-row sample, then features.build, whose log output below shows it
# running clean, fit_transform and two further transform calls.
features = Features()
df = df.sample(n=10000, random_state=62).copy()
df_features = features.build(df)

2022-12-04 06:24:54,631 - sentiment_analysis.utils.utils - INFO - func:clean took: 17.33 sec
2022-12-04 06:24:56,729 - sentiment_analysis.utils.utils - INFO - func:fit took: 2.09 sec
2022-12-04 06:24:57,310 - sentiment_analysis.utils.utils - INFO - func:transform took: 0.58 sec
2022-12-04 06:24:57,311 - sentiment_analysis.utils.utils - INFO - func:fit_transform took: 2.67 sec
2022-12-04 06:24:57,373 - sentiment_analysis.utils.utils - INFO - func:transform took: 0.06 sec
2022-12-04 06:24:58,044 - sentiment_analysis.utils.utils - INFO - func:transform took: 0.67 sec
2022-12-04 06:24:58,179 - sentiment_analysis.utils.utils - INFO - func:build took: 20.88 sec


In [5]:
# Display the frame returned by features.build; per the output it holds the
# train and test rows along with the word__* feature columns.
df_features

Unnamed: 0,review,sentiment,Split,Original Text,word__10 10,word__10 minute,word__10 year,word__12 year,word__14 year,word__15 minute,...,word__youll see,word__young boy,word__young girl,word__young man,word__young men,word__young woman,word__youre going,word__youre looking,word__youve got,word__youve seen
0,white fire much going larry bird look alike ro...,1,train,White Fire has so much going for it. With Larr...,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.719512,0.0,0.0
1,yesterday seen second part must say actually b...,0,train,I have yesterday seen the second part. And I m...,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
2,film weak plot weak characterization really we...,0,train,"This film has a weak plot, weak characterizati...",0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
3,movie surprised good way box got impression ac...,1,train,This movie surprised me in a good way. From th...,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
4,movie good tv like im huge fan disaster film e...,1,train,This movie is good for TV. I like it because I...,0.0,0.000000,0.0,0.323651,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,enjoyed movie expected enough action intrigue ...,1,test,I enjoyed this movie. More than I expected. It...,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
9996,watched movie last night within 30 minute star...,0,test,I just watched this movie last night. Within 3...,0.0,0.163432,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
9997,came surprise depressing draining movie impact...,1,test,It came as no surprise to me that this was a v...,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
9998,emperor new groove cast return disney picture ...,0,test,The EMPEROR'S NEW GROOVE cast returns for Disn...,0.0,0.000000,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0
