# Preprocessing - "The Office" dataset
This notebook aims to provide parameterizable functions to preprocess the "The Office" dataset for further NLP analysis. 

In [5]:
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import contractions

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer


PATH = "../data/"
FILE = "unmod_the-office-lines - scripts.csv"

In [6]:
df = pd.read_csv(PATH+FILE, sep=",", index_col="id")

In [7]:
# concatenate line_text for each scene
def concatenate_scenes(df):
    df = df.groupby(["season", "episode", "scene"])["line_text"].apply(lambda x: " ".join(x)).reset_index()
    return df

In [8]:
def extract_directorals(df):
    # extract text from line_text in square brackets, put it in new column called "directionals", multiple square brackets will be extracted as a list
    df["directionals"] = df["line_text"].str.extractall(r"\[(.*?)\]").unstack().apply(lambda x: ", ".join(x.dropna()), axis=1)
    # delete the extracted text from line_text
    df["line_text"] = df["line_text"].str.replace(r"\[(.*?)\]", "", regex=True).str.strip()
    return df


In [9]:
# bare string preprocessing
def remove_punctuation(df):
    return df["line_text"].str.replace(r"[^\w\s]", "", regex=True)

def lower(df):
    return df["line_text"].apply(lambda x: x.lower())

def remove_stopwords(df):
    stop_words = set(stopwords.words('english'))
    return df["line_text"].apply(lambda x: " ".join([word for word in word_tokenize(x) if word not in stop_words]))

def expanding_contractions(df):
    return df["line_text"].apply(lambda x: contractions.fix(x))


In [10]:
def tokenize(df):
    return df["line_text"].apply(lambda x: word_tokenize(x))

# TODO: make this paremeterizable
def segmentation(df):
    return df["line_text"].apply(lambda x: x.split(" "))

def lemmatize(df):
    wordnet_lemmatizer = WordNetLemmatizer()
    # is working, but not very good results because of the simple speech of the characters
    return df["line_text"].apply(lambda x: " ".join([wordnet_lemmatizer.lemmatize(word) for word in word_tokenize(x)]))

def stem(df):
    porter_stemmer = PorterStemmer()
    return df["line_text"].apply(lambda x: " ".join([porter_stemmer.stem(word) for word in word_tokenize(x)]))

# tagging
def pos_tag(df):
    return df["line_text"].apply(lambda x: nltk.pos_tag(word_tokenize(x)))

In [11]:
def preprocess(
        df, 
        concat_scenes=False, 
        extract_direc=False, 
        remove_punct=False, 
        rmv_stopwords=False, 
        lwr=False, 
        exp_contractions=False, 
        conversion:str=None
        )->pd.DataFrame:
    if (concat_scenes):
        df = concatenate_scenes(df)
    if (extract_direc):
        df = extract_directorals(df)

    if (remove_punct):
        df['line_text'] = remove_punctuation(df)
    if (lwr):
        df['line_text'] = lower(df)
    if (rmv_stopwords):
        df['line_text'] = remove_stopwords(df)
    if (exp_contractions):
        df['line_text'] = expanding_contractions(df)   

    if (conversion == "tokenize"):
        df['line_text']  = tokenize(df)
    elif (conversion == "segment"):
        df['line_text'] = segmentation(df)
    elif (conversion == "lemmatize"):
        df['line_text'] = lemmatize(df)
    elif (conversion == "stem"):
        df['line_text'] = stem(df)
    elif (conversion == "pos_tag"):
        df['line_text'] = pos_tag(df)


    return df
        

In [12]:
# Parameters
param_dict = {
    "concat_scenes": True,
    "extract_direc": True, 
    "remove_punct": True, 
    "rmv_stopwords": True,
    "lwr": True, 
    "exp_contractions": True,
    "conversion": None
}


In [14]:
preprocessed_df = preprocess(df, **param_dict)

pd.set_option("display.max_colwidth", None)
preprocessed_df

LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/english.pickle[0m

  Searched in:
    - 'C:\\Users\\ti-he/nltk_data'
    - 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.10_3.10.2800.0_x64__qbz5n2kfra8p0\\nltk_data'
    - 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.10_3.10.2800.0_x64__qbz5n2kfra8p0\\share\\nltk_data'
    - 'C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.10_3.10.2800.0_x64__qbz5n2kfra8p0\\lib\\nltk_data'
    - 'C:\\Users\\ti-he\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
    - ''
**********************************************************************


In [None]:
# feature extraction
def extract_features(df, vectorizer):
    if vectorizer == "binary":
        vectorizer = CountVectorizer(binary=True)
    elif vectorizer == "count":
        vectorizer = CountVectorizer() 
    elif vectorizer == "tfidf":
        vectorizer = TfidfVectorizer()
    elif vectorizer == "hashing":
        vectorizer = HashingVectorizer()

    result = vectorizer.fit_transform(df["line_text"])
    return result

def feature_selection (feature_df, selection_method):
    # TODO: add feature selection e.g. DF (document frequency)
    print("nothin here yet")

In [None]:
# feature extraction
feature_df = extract_features(df, "count")

In [None]:
# save the preprocessed data
df.to_csv(PATH+"preprocessed_"+FILE, sep=",", index=True)
feature_df.to_csv(PATH+"feature_"+FILE, sep=",", index=True)