# Project 7: Movie Reviews - Working with text data

For this module, we will be exploring the techniques for working with text data. You will learn about:

* Text cleaning and pre-processing
* Text classification

In [1]:
import pandas as pd
import polars as pl

import glob
import os

In [2]:
!ls data/aclImdb/train

'ls' is not recognized as an internal or external command,
operable program or batch file.


In [3]:
!cat data/aclImdb/README

'cat' is not recognized as an internal or external command,
operable program or batch file.


In [4]:
!cat data/aclImdb/train/pos/10000_8.txt

'cat' is not recognized as an internal or external command,
operable program or batch file.


## Load Data

In [5]:
import os
import glob
import pandas as pd

def read_reviews(data_dir:str, dataset:str, sentiment:str, limit:int=None) -> pd.DataFrame:
    dfs = []
    i = 0
    # set the path to the directory containing the reviews for this sentiment
    reviews_dir = os.path.join(data_dir, dataset, sentiment)
    print(f'{reviews_dir=}')
    # loop through all the text files in the directory
    for file_path in glob.glob(os.path.join(reviews_dir, "*.txt")):
        # extract the unique id and rating from the file name
        file_name = os.path.basename(file_path)
        unique_id = file_name.split("_")[0]
        rating = file_name.split("_")[1].split(".")[0]

        # read the text file into a pandas DataFrame
        with open(file_path, "r") as f:
            review_text = f.read()
            df = pd.DataFrame({"unique_id": [unique_id], 
                "rating": [rating], "sentiment": [sentiment],
                "review_text": [review_text]})

        # append the DataFrame to the list
        dfs.append(df)
        i += 1
        if limit is not None and i > limit:
            break
    return pd.concat(dfs, ignore_index=True)


data_dir = "data/aclImdb/"
dataset = "train"
df_pos = read_reviews(data_dir, dataset, sentiment='pos', limit=300)
df_neg = read_reviews(data_dir, dataset, sentiment='neg', limit=300)

df = (pd.concat([df_pos, df_neg], axis='index')
   .reset_index(drop=True)
      .astype({'rating': 'int8[pyarrow]',
               'unique_id': 'int64[pyarrow]',
               'sentiment': 'string[pyarrow]',
               'review_text': 'string[pyarrow]'
              })
     )

# print the first five rows of the DataFrame
print(df.sample())

reviews_dir='data/aclImdb/train\\pos'
reviews_dir='data/aclImdb/train\\neg'
     unique_id  rating sentiment  \
492      10172       1       neg   

                                           review_text  
492  When I spotted that Noah Wyle and Ricky Schrod...  


In [6]:
df

Unnamed: 0,unique_id,rating,sentiment,review_text
0,0,9,pos,Bromwell High is a cartoon comedy. It ran at t...
1,10000,8,pos,Homelessness (or Houselessness as George Carli...
2,10001,10,pos,Brilliant over-acting by Lesley Ann Warren. Be...
3,10002,7,pos,This is easily the most underrated film inn th...
4,10003,8,pos,This is not the typical Mel Brooks film. It wa...
...,...,...,...,...
597,10267,1,neg,This piece ain't really worth a comment.. It's...
598,10268,1,neg,Without a doubt this is one of the worst films...
599,10269,1,neg,"this movie is outrageous. by outrageous, i mea..."
600,1026,3,neg,Wow. I do not think I have ever seen a movie w...


In [9]:
df_pl = pl.from_pandas(df)

In [10]:
df_pl

unique_id,rating,sentiment,review_text
i64,i8,str,str
0,9,"""pos""","""Bromwell High …"
10000,8,"""pos""","""Homelessness (…"
10001,10,"""pos""","""Brilliant over…"
10002,7,"""pos""","""This is easily…"
10003,8,"""pos""","""This is not th…"
10004,8,"""pos""","""This isn't the…"
10005,7,"""pos""","""Yes its an art…"
10006,7,"""pos""","""In this ""criti…"
10007,7,"""pos""","""THE NIGHT LIST…"
10008,7,"""pos""","""You know, Robi…"


## Basic String Manipulation

In [7]:
df.review_text.str.capitalize()

0      Bromwell high is a cartoon comedy. it ran at t...
1      Homelessness (or houselessness as george carli...
2      Brilliant over-acting by lesley ann warren. be...
3      This is easily the most underrated film inn th...
4      This is not the typical mel brooks film. it wa...
                             ...                        
597    This piece ain't really worth a comment.. it's...
598    Without a doubt this is one of the worst films...
599    This movie is outrageous. by outrageous, i mea...
600    Wow. i do not think i have ever seen a movie w...
601    "american nightmare" is officially tied, in my...
Name: review_text, Length: 602, dtype: string

In [8]:
df.review_text.str

<pandas.core.strings.accessor.StringMethods at 0x1a58a2daae0>

In [27]:
print([x for x in dir(df.review_text.str)
           if not x.startswith('_')])

['capitalize', 'casefold', 'cat', 'center', 'contains', 'count', 'decode', 'encode', 'endswith', 'extract', 'extractall', 'find', 'findall', 'fullmatch', 'get', 'get_dummies', 'index', 'isalnum', 'isalpha', 'isdecimal', 'isdigit', 'islower', 'isnumeric', 'isspace', 'istitle', 'isupper', 'join', 'len', 'ljust', 'lower', 'lstrip', 'match', 'normalize', 'pad', 'partition', 'removeprefix', 'removesuffix', 'repeat', 'replace', 'rfind', 'rindex', 'rjust', 'rpartition', 'rsplit', 'rstrip', 'slice', 'slice_replace', 'split', 'startswith', 'strip', 'swapcase', 'title', 'translate', 'upper', 'wrap', 'zfill']


In [28]:
print([x for x in dir('')
           if not x.startswith('_')])

['capitalize', 'casefold', 'center', 'count', 'encode', 'endswith', 'expandtabs', 'find', 'format', 'format_map', 'index', 'isalnum', 'isalpha', 'isascii', 'isdecimal', 'isdigit', 'isidentifier', 'islower', 'isnumeric', 'isprintable', 'isspace', 'istitle', 'isupper', 'join', 'ljust', 'lower', 'lstrip', 'maketrans', 'partition', 'removeprefix', 'removesuffix', 'replace', 'rfind', 'rindex', 'rjust', 'rpartition', 'rsplit', 'rstrip', 'split', 'splitlines', 'startswith', 'strip', 'swapcase', 'title', 'translate', 'upper', 'zfill']


## Remove Stop Words

In [None]:
# !pip install spacy

In [30]:
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     --------------------------------------- 0.0/12.8 MB 445.2 kB/s eta 0:00:29
     --------------------------------------- 0.1/12.8 MB 573.4 kB/s eta 0:00:23
      --------------------------------------- 0.2/12.8 MB 1.1 MB/s eta 0:00:12
     - -------------------------------------- 0.4/12.8 MB 1.8 MB/s eta 0:00:08
     - -------------------------------------- 0.6/12.8 MB 2.5 MB/s eta 0:00:05
     -- ------------------------------------- 0.9/12.8 MB 3.1 MB/s eta 0:00:04
     --- ------------------------------------ 1.3/12.8 MB 3.8 MB/s eta 0:00:04
     ---- ----------------------------------- 1.6/12.8 MB 4.1 MB/s eta 0:00:03
     ----- ---------------------------------- 


[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [31]:
!python -m spacy validate


⠙ Loading compatibility table...
⠹ Loading compatibility table...
⠸ Loading compatibility table...
⠼ Loading compatibility table...
⠴ Loading compatibility table...
[2K[38;5;2m✔ Loaded compatibility table[0m
[1m
[38;5;4mℹ spaCy installation: c:\Users\jborr\Documents\Eng\Talk Python
Training\env\Lib\site-packages\spacy[0m

NAME             SPACY            VERSION                            
en_core_web_sm   >=3.7.2,<3.8.0   [38;5;2m3.7.1[0m   [38;5;2m✔[0m



In [32]:
import spacy
nlp = spacy.load('en_core_web_sm')

In [33]:
%%time
def remove_stop(txt, nlp):
    doc = nlp(txt)
    res =  ' '.join(tok.text for tok in doc if not tok.is_stop)
    #print(res)
    return res

(df
 .review_text
 .apply(remove_stop, nlp=nlp)
)

CPU times: total: 21 s
Wall time: 22 s


0      Bromwell High cartoon comedy . ran time progra...
1      Homelessness ( Houselessness George Carlin sta...
2      Brilliant - acting Lesley Ann Warren . Best dr...
3      easily underrated film inn Brooks cannon . Sur...
4      typical Mel Brooks film . slapstick movies act...
                             ...                        
597    piece ai worth comment .. simply worst " horro...
598    doubt worst films wasted money ! plot , erm so...
599    movie outrageous . outrageous , mean awful . f...
600    Wow . think seen movie great actors pivotal ro...
601    " American Nightmare " officially tied , opini...
Name: review_text, Length: 602, dtype: object

In [34]:
def remove_stop(txt, nlp):
    txt = txt.replace('<br />', '')
    doc = nlp(txt)
    res =  ' '.join(tok.text for tok in doc if not tok.is_stop)
    return res

(df
 .review_text
 .apply(remove_stop, nlp=nlp)
)

0      Bromwell High cartoon comedy . ran time progra...
1      Homelessness ( Houselessness George Carlin sta...
2      Brilliant - acting Lesley Ann Warren . Best dr...
3      easily underrated film inn Brooks cannon . Sur...
4      typical Mel Brooks film . slapstick movies act...
                             ...                        
597    piece ai worth comment .. simply worst " horro...
598    doubt worst films wasted money ! plot , erm so...
599    movie outrageous . outrageous , mean awful . f...
600    Wow . think seen movie great actors pivotal ro...
601    " American Nightmare " officially tied , opini...
Name: review_text, Length: 602, dtype: object

In [35]:
df.review_text

0      Bromwell High is a cartoon comedy. It ran at t...
1      Homelessness (or Houselessness as George Carli...
2      Brilliant over-acting by Lesley Ann Warren. Be...
3      This is easily the most underrated film inn th...
4      This is not the typical Mel Brooks film. It wa...
                             ...                        
597    This piece ain't really worth a comment.. It's...
598    Without a doubt this is one of the worst films...
599    this movie is outrageous. by outrageous, i mea...
600    Wow. I do not think I have ever seen a movie w...
601    "American Nightmare" is officially tied, in my...
Name: review_text, Length: 602, dtype: string

## Get Tfidf

In [None]:
# !pip install scikit-learn

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
txt_no_stop = df.review_text.apply(remove_stop, nlp=nlp)
sparse = vectorizer.fit_transform(txt_no_stop)

In [37]:
sparse

<602x12327 sparse matrix of type '<class 'numpy.float64'>'
	with 51987 stored elements in Compressed Sparse Row format>

In [None]:
vectorizer.get_feature_names_out()

In [38]:
tf_df = pd.DataFrame(sparse.todense(), 
                     columns=vectorizer.get_feature_names_out())

In [39]:
pd.concat([df, tf_df], axis='columns')

Unnamed: 0,unique_id,rating,sentiment,review_text,000,00am,02,05,06,07,...,zombiez,zombified,zone,zoom,zooms,zorro,zsigmond,zu,zubeidaa,â½
0,0,9,pos,Bromwell High is a cartoon comedy. It ran at t...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10000,8,pos,Homelessness (or Houselessness as George Carli...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10001,10,pos,Brilliant over-acting by Lesley Ann Warren. Be...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10002,7,pos,This is easily the most underrated film inn th...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10003,8,pos,This is not the typical Mel Brooks film. It wa...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
597,10267,1,neg,This piece ain't really worth a comment.. It's...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
598,10268,1,neg,Without a doubt this is one of the worst films...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
599,10269,1,neg,"this movie is outrageous. by outrageous, i mea...",0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
600,1026,3,neg,Wow. I do not think I have ever seen a movie w...,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [40]:
df.sentiment.value_counts()

sentiment
pos    301
neg    301
Name: count, dtype: int64[pyarrow]

## Make Classification Model

In [None]:
# !pip install xgboost

In [41]:
import xgboost as xg
from sklearn import model_selection

In [42]:
X = tf_df
y = df.sentiment == 'pos'
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, stratify=y, random_state=42)

In [43]:
y

0       True
1       True
2       True
3       True
4       True
       ...  
597    False
598    False
599    False
600    False
601    False
Name: sentiment, Length: 602, dtype: bool[pyarrow]

In [44]:
xgb = xg.XGBClassifier()
xgb.fit(X_train, y_train)

In [45]:
xgb.score(X_test, y_test)

0.7814569536423841

## Predict a New Review

In [None]:
X_new = (pd.DataFrame({'review_text': 
              ['I hated this movie. It was the worst. I don not recommend it',
              'This was the best movie I have ever seen',
              'I think I know how I felt about this move. Both good but weird parts']})
.review_text
.apply(remove_stop, nlp=nlp)
.pipe(lambda ser: 
      pd.DataFrame(vectorizer.transform(ser).todense(),
            columns=vectorizer.get_feature_names_out()))
)

X_new

In [None]:
xgb.predict(X_new)

In [None]:
xgb.predict_proba(X_new)

In [None]:
xgb.classes_