# Project 7: Movie Reviews - Working with text data

In this module, we explore techniques for working with text data. You will learn about:

* Text cleaning and pre-processing
* Text classification

In [None]:
import pandas as pd

import glob
import os

In [None]:
# List the training split directory; read_reviews (below) reads the
# pos/ and neg/ sub-folders found here.
!ls data/aclImdb/train

In [None]:
# Print the README that ships with the dataset.
!cat data/aclImdb/README

In [None]:
# Print one positive training review. read_reviews (below) parses file names
# like this one as <unique_id>_<rating>.txt.
!cat data/aclImdb/train/pos/10000_8.txt

## Load Data

In [None]:
import os
import glob
import pandas as pd

def read_reviews(data_dir, dataset, sentiment, limit=None):
    """Load the review files for one sentiment label into a DataFrame.

    Files are read from ``<data_dir>/<dataset>/<sentiment>/*.txt`` and their
    names are parsed as ``<unique_id>_<rating>.txt``.

    Args:
        data_dir: Root directory of the dataset.
        dataset: Name of the split sub-directory (e.g. ``"train"``).
        sentiment: Name of the label sub-directory (e.g. ``"pos"``); it is
            also stored verbatim in the ``sentiment`` column.
        limit: Maximum number of files to read. ``None`` reads every file.

    Returns:
        A DataFrame with one row per file and the string columns
        ``unique_id``, ``rating``, ``sentiment`` and ``review_text``. When no
        file matches, an empty DataFrame with those columns is returned.

    Raises:
        IndexError: If a file name contains no ``_`` separator.
    """
    columns = ["unique_id", "rating", "sentiment", "review_text"]

    # set the path to the directory containing the reviews for this sentiment
    reviews_dir = os.path.join(data_dir, dataset, sentiment)
    print(f'{reviews_dir=}')

    # glob returns files in arbitrary order; sorting makes `limit` select the
    # same files on every run.
    file_paths = sorted(glob.glob(os.path.join(reviews_dir, "*.txt")))
    if limit is not None:
        # Keep exactly `limit` files (a negative limit is treated as zero).
        file_paths = file_paths[:max(limit, 0)]

    rows = []
    for file_path in file_paths:
        # extract the unique id and rating from the file name
        name_parts = os.path.basename(file_path).split("_")
        unique_id = name_parts[0]
        rating = name_parts[1].split(".")[0]

        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default encoding.
        with open(file_path, "r", encoding="utf-8") as f:
            review_text = f.read()

        rows.append((unique_id, rating, sentiment, review_text))

    # Build the frame once instead of concatenating one frame per file; this
    # also yields a well-formed empty frame when nothing was read.
    return pd.DataFrame(rows, columns=columns)


# Location of the extracted dataset and the split to load.
data_dir = "data/aclImdb/"
dataset = "train"

# Load a capped subset of the reviews for each sentiment label.
df_pos = read_reviews(data_dir, dataset, sentiment='pos', limit=300)
df_neg = read_reviews(data_dir, dataset, sentiment='neg', limit=300)

# Stack both labels into one frame with a fresh 0..n-1 index, then cast every
# column to an Arrow-backed dtype.
arrow_dtypes = {
    'rating': 'int8[pyarrow]',
    'unique_id': 'int64[pyarrow]',
    'sentiment': 'string[pyarrow]',
    'review_text': 'string[pyarrow]',
}
df = pd.concat([df_pos, df_neg], axis=0, ignore_index=True).astype(arrow_dtypes)

# print one randomly sampled row of the DataFrame
print(df.sample())

In [None]:
# Display the combined reviews DataFrame.
df

## Basic String Manipulation

In [None]:
# Apply str.capitalize to every review through the .str accessor
# (the result is displayed, not stored).
df.review_text.str.capitalize()

In [None]:
# Display the .str accessor object itself.
df.review_text.str

In [None]:
# Public (non-underscore) attributes offered by the pandas .str accessor.
str_accessor = df.review_text.str
print([attr for attr in dir(str_accessor) if not attr.startswith('_')])

In [None]:
# Public (non-underscore) attributes of the built-in str type, for comparison
# with the pandas .str accessor.
print([attr for attr in dir('') if not attr.startswith('_')])

## Remove Stop Words

In [None]:
!pip install spacy

In [None]:
# Download the small English spaCy pipeline that is loaded below with
# spacy.load('en_core_web_sm').
# NOTE(review): `python` here is resolved via PATH and may not be the
# kernel's interpreter — confirm the model lands in the right environment.
!python -m spacy download en_core_web_sm

In [None]:
# Run spaCy's `validate` command to check the installed pipelines against the
# installed spaCy version.
!python -m spacy validate

In [None]:
import spacy
# Load the pipeline once; later cells pass it to remove_stop as `nlp`.
nlp = spacy.load('en_core_web_sm')

In [None]:
%%time
def remove_stop(txt, nlp):
    """Return *txt* with its stop-word tokens removed.

    Args:
        txt: The text to clean.
        nlp: Callable that tokenizes a string into objects exposing ``.text``
            and ``.is_stop`` (e.g. a loaded spaCy pipeline).

    Returns:
        The texts of the tokens that are not stop words, joined by single
        spaces.
    """
    kept = []
    for token in nlp(txt):
        if token.is_stop:
            continue
        kept.append(token.text)
    return ' '.join(kept)

# Run remove_stop on every review; Series.apply forwards `nlp` to it as a
# keyword argument. The result is displayed only, not assigned.
(df
 .review_text
 .apply(remove_stop, nlp=nlp)
)

In [None]:
def remove_stop(txt, nlp):
    """Strip HTML line-break tags and stop words from a review.

    Args:
        txt: Raw review text; may contain literal ``<br />`` tags.
        nlp: Callable that tokenizes a string into objects exposing ``.text``
            and ``.is_stop`` (e.g. a loaded spaCy pipeline).

    Returns:
        The texts of the tokens that are not stop words, joined by single
        spaces.
    """
    # Swap each tag for a space instead of deleting it: the tag is often the
    # only separator between two words ("great.<br /><br />The"), and deleting
    # it would fuse them into a single token ("great.The").
    txt = txt.replace('<br />', ' ')
    return ' '.join(tok.text for tok in nlp(txt) if not tok.is_stop)

# Re-run the cleaning with the remove_stop defined in this cell. The result
# is displayed only, not assigned back to df.
(df
 .review_text
 .apply(remove_stop, nlp=nlp)
)

In [None]:
# The apply calls above were never assigned back, so this still shows the
# original, uncleaned text.
df.review_text

## Get Tfidf

In [None]:
!pip install scikit-learn

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer with default settings.
vectorizer = TfidfVectorizer()
# Clean every review first, then learn the vocabulary from the cleaned text
# and transform it in one step.
txt_no_stop = df.review_text.apply(remove_stop, nlp=nlp)
sparse = vectorizer.fit_transform(txt_no_stop)

In [None]:
# Display the matrix returned by fit_transform.
sparse

In [None]:
# Display the feature names learned by the vectorizer; they are used as the
# column labels of tf_df below.
vectorizer.get_feature_names_out()

In [None]:
# Densify the TF-IDF matrix into a DataFrame with one column per feature name.
tf_df = pd.DataFrame(sparse.todense(), 
                     columns=vectorizer.get_feature_names_out())

In [None]:
# Show the review metadata side by side with the TF-IDF columns
# (displayed only, not stored).
pd.concat([df, tf_df], axis='columns')

In [None]:
# Count the rows per sentiment label to check the class balance.
df.sentiment.value_counts()

## Make Classification Model

In [None]:
!pip install xgboost

In [None]:
import xgboost as xg
from sklearn import model_selection

In [None]:
# Features are the TF-IDF columns; the target is True for positive reviews.
X = tf_df
y = df.sentiment == 'pos'
# stratify=y keeps the label proportions equal in both parts; random_state
# makes the split repeatable.
# NOTE(review): the vectorizer was fit on all rows before this split, so the
# test rows already influenced the vocabulary and IDF weights — consider
# fitting it on the training rows only.
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, stratify=y, random_state=42)

In [None]:
# Display the boolean target (True where sentiment == 'pos').
y

In [None]:
# Fit an XGBoost classifier with default hyperparameters on the training part.
xgb = xg.XGBClassifier()
xgb.fit(X_train, y_train)

In [None]:
# Score the model on the held-out part (presumably mean accuracy, as for
# scikit-learn classifiers — confirm).
xgb.score(X_test, y_test)

## Predict a New Review

In [None]:
# Three hand-written reviews to run through the trained model.
new_reviews = pd.Series(
    ['I hated this movie. It was the worst. I don not recommend it',
     'This was the best movie I have ever seen',
     'I think I know how I felt about this move. Both good but weird parts'],
    name='review_text',
)

# Same preprocessing as for training: drop stop words, then reuse the already
# fitted vectorizer (transform, not fit) so the columns line up with tf_df.
new_clean = new_reviews.apply(remove_stop, nlp=nlp)
X_new = pd.DataFrame(vectorizer.transform(new_clean).todense(),
                     columns=vectorizer.get_feature_names_out())

X_new

In [None]:
# Predicted label for each of the three new reviews.
xgb.predict(X_new)

In [None]:
# Per-class probabilities for each of the three new reviews.
xgb.predict_proba(X_new)

In [None]:
# Class labels known to the model (presumably in the same order as the
# predict_proba columns — confirm).
xgb.classes_