In [1]:
import os
from pathlib import Path
import logging
import itertools

import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer

from sentiment_analysis.config import SentimentAnalysisConfig
from sentiment_analysis.data_access import DataClass
from sentiment_analysis.features import Features
from sentiment_analysis.utils.constants import (
    TEXT,
    TARGET,
    ORIGINAL_TEXT,
    SPLIT,
    TRAIN,
    VALID,
    TEST
)

# --- Plot appearance ---------------------------------------------------------
# The seaborn style is applied first and the serif override afterwards, so the
# override is the last write to the font family.
# NOTE(review): seaborn styles appear to set font rcParams as well — confirm for
# the installed version before reordering these lines.
sns.set_style("darkgrid")
sns.set_color_codes("muted")
matplotlib.rcParams['font.family'] = "serif"

# --- Paths -------------------------------------------------------------------
# Absolute path of the parent of the current working directory.
PARENT_PATH = Path.cwd().parent.absolute()

# --- Logging -----------------------------------------------------------------
# Record layout: timestamp - logger name - level - message; root level is INFO.
FORMAT = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
logging.basicConfig(level=logging.INFO, format=FORMAT)

In [2]:
# Create the project configuration and set its CURRENT_PATH to PARENT_PATH
# (the parent of the working directory, computed in the setup cell).
config = SentimentAnalysisConfig()
config.CURRENT_PATH = PARENT_PATH

# Build the dataset through DataClass and preview its first 10 rows.
# NOTE(review): the recorded preview contains an "Unnamed: 0" column, which
# usually means a saved CSV index was read back as data — confirm in DataClass.
data = DataClass(config)
df = data.build()
df.head(10)

2022-12-04 06:29:28,631 - sentiment_analysis.utils.utils - INFO - func:build took: 1.41 sec


Unnamed: 0,review,sentiment,Split
0,how can a director that makes such great films...,neg,development
1,This is one of the worst film adaptations of a...,neg,development
2,This is an art film that was either made in 19...,neg,development
3,"For two of the funniest comedians, the movie w...",neg,development
4,Doris Day never lets a bad script get her down...,neg,development
5,Unwatchable. You can't even make it past the f...,neg,development
6,"Quite average even by Monogram standards, this...",neg,development
7,This is not a good movie. Too preachy in parts...,neg,development
8,I am really shocked that a great director like...,neg,development
9,This is one of the weakest soft porn film arou...,neg,development


In [3]:
# Run the feature pipeline over the loaded frame.
# This is the slow step of the notebook: the recorded log shows build taking
# about 2 min 25 s, with clean alone accounting for about 1 min 21 s.
# NOTE(review): consider caching the result to disk for re-runs; whether
# build() modifies `df` in place is not visible from this cell.
features = Features()
df_features = features.build(df)

2022-12-04 06:30:49,988 - sentiment_analysis.utils.utils - INFO - func:clean took: 1 min and                 21.18 sec
2022-12-04 06:31:09,870 - sentiment_analysis.utils.utils - INFO - func:fit took: 19.86 sec
2022-12-04 06:31:28,086 - sentiment_analysis.utils.utils - INFO - func:transform took: 18.21 sec
2022-12-04 06:31:28,087 - sentiment_analysis.utils.utils - INFO - func:fit_transform took: 38.08 sec
2022-12-04 06:31:29,961 - sentiment_analysis.utils.utils - INFO - func:transform took: 1.87 sec
2022-12-04 06:31:48,852 - sentiment_analysis.utils.utils - INFO - func:transform took: 18.89 sec
2022-12-04 06:31:53,860 - sentiment_analysis.utils.utils - INFO - func:build took: 2 min and                 25.07 sec


In [4]:
# Preview the first rows of the feature frame.
# The recorded output shows the review, sentiment, Split and "Original Text"
# columns followed by many columns prefixed char__ and word__.
# NOTE(review): these look like character- and word-level n-gram weights —
# confirm against the Features implementation.
df_features.head()

Unnamed: 0,review,sentiment,Split,Original Text,char__ 00,char__ 1,char__ 10,char__ 11,char__ 12,char__ 13,...,word__york city,word__youll see,word__young boy,word__young girl,word__young man,word__young woman,word__youre going,word__youre looking,word__youve got,word__youve seen
0,saw movie tv afternoon cant see anyone sit pie...,0,train,I saw this movie on t.v. this afternoon and I ...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,film true historical film useful researching l...,1,train,This film is a true and historical film. It is...,0.0,0.0,0.05085,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,film something like sequel white zombie since ...,0,train,"This film is something like a sequel of ""White...",0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,like jane austin novel love pride prejudice se...,0,train,i like Jane Austin novels. I love Pride and Pr...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,whoever wrote script movie deserve work hollyw...,0,train,Whoever wrote the script for this movie does n...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Count rows per data split; the column name comes from the SPLIT constant.
# Recorded run: 25000 test, 22500 train, 2500 validation.
df_features[SPLIT].value_counts()

test          25000
train         22500
validation     2500
Name: Split, dtype: int64