# Preprocessing - "The Office" dataset
This notebook aims to provide parameterizable functions to preprocess the "The Office" dataset for further NLP analysis. 

In [21]:
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
import contractions

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer


PATH = "../data/"
FILE = "unmod_the-office-lines - scripts.csv"

In [44]:
df = pd.read_csv(PATH+FILE, sep=",", index_col="id")

In [4]:
# concatenate line_text for each scene
def concatenate_scenes(df):
    df = df.groupby(["season", "episode", "scene"])["line_text"].apply(lambda x: " ".join(x)).reset_index()
    return df

In [5]:
def extract_directorals(df):
    # extract text from line_text in square brackets, put it in new column called "directionals", multiple square brackets will be extracted as a list
    df["directionals"] = df["line_text"].str.extractall(r"\[(.*?)\]").unstack().apply(lambda x: ", ".join(x.dropna()), axis=1)
    # delete the extracted text from line_text
    df["line_text"] = df["line_text"].str.replace(r"\[(.*?)\]", "", regex=True).str.strip()
    return df


In [27]:
# bare string preprocessing
def remove_punctuation(df):
    return df["line_text"].str.replace(r"[^\w\s]", "", regex=True)

def lower(df):
    return df["line_text"].apply(lambda x: x.lower())

def remove_stopwords(df):
    stop_words = set(stopwords.words('english'))
    return df["line_text"].apply(lambda x: " ".join([word for word in word_tokenize(x) if word not in stop_words]))

def expanding_contractions(df):
    return df["line_text"].apply(lambda x: contractions.fix(x))


In [7]:
def tokenize(df):
    return df["line_text"].apply(lambda x: word_tokenize(x))

# TODO: make this paremeterizable
def segmentation(df):
    return df["line_text"].apply(lambda x: x.split(" "))

def lemmatize(df):
    wordnet_lemmatizer = WordNetLemmatizer()
    # is working, but not very good results because of the simple speech of the characters
    return df["line_text"].apply(lambda x: " ".join([wordnet_lemmatizer.lemmatize(word) for word in word_tokenize(x)]))

def stem(df):
    porter_stemmer = PorterStemmer()
    return df["line_text"].apply(lambda x: " ".join([porter_stemmer.stem(word) for word in word_tokenize(x)]))

# tagging
def pos_tag(df):
    return df["line_text"].apply(lambda x: nltk.pos_tag(word_tokenize(x)))

In [82]:
def preprocess(df, concat_scenes=False, extract_direc=False, remove_punct=False, rmv_stopwords=False, lwr=False, exp_contractions=False, conversion=None):
    if (concat_scenes):
        df = concatenate_scenes(df)
    if (extract_direc):
        df = extract_directorals(df)

    if (remove_punct):
        df['line_text'] = remove_punctuation(df)
    if (lwr):
        df['line_text'] = lower(df)
    if (rmv_stopwords):
        df['line_text'] = remove_stopwords(df)
    if (exp_contractions):
        df['line_text'] = expanding_contractions(df)   

    if (conversion == "tokenize"):
        df['line_text']  = tokenize(df)
    elif (conversion == "segment"):
        df['line_text'] = segmentation(df)
    elif (conversion == "lemmatize"):
        df['line_text'] = lemmatize(df)
    elif (conversion == "stem"):
        df['line_text'] = stem(df)
    elif (conversion == "pos_tag"):
        df['line_text'] = pos_tag(df)


    return df
        

In [80]:
# Parameters
param_dict = {
    "concat_scenes": True,
    "extract_direc": True, 
    "remove_punct": True, 
    "rmv_stopwords": True,
    "lwr": True, 
    "exp_contractions": True,
    "conversion": None
}


In [83]:
preprocessed_df = preprocess(df, **param_dict)

pd.set_option("display.max_colwidth", None)
preprocessed_df

Unnamed: 0,season,episode,scene,line_text,directionals
0,1,1,1,right jim quarterlies look good things library oh told could not close you have come master guidance you are saying grasshopper actually called yeah right well let show done,
1,1,1,2,yes id like speak office manager please yes hello michael scott regional manager dunder mifflin paper products wanted talk manageramanger right done deal thank much sir you are gentleman scholar oh i am sorry ok i am sorry mistake woman talking low voice probably smoker that is way done,"on the phone, quick cut scene, hangs up, Clears throat"
2,1,1,3,i have uh i have dunder mifflin 12 years last four regional manager want come see entire floor kingdom far eye see receptionist pam pam pampam pam beesly pam us forever right pam well do not know think she is cute seen couple years ago messages uh yeah fax oh pam corporate many times told there is special filing cabinet things corporate have not told called wastepaper basket look look face,growls
3,1,1,4,people say best boss go god we have never worked place like you are hilarious get best us think pretty much sums found spencer gifts,shows the camera his WORLD'S BEST BOSS mug
4,1,1,5,shall play pa rum pump um pum gifts pa rum pump um pum,"singing, Imitates heavy drumming, Imitates heavy drumming"
...,...,...,...,...,...
9156,9,23,112,seems arbitrary applied job company hiring took desk back empty matter get end human beings miraculous gift make place home let,"chuckles, standing with two cops"
9157,9,23,113,feel lucky got chance share crummy story anyone thinks one take dump paper shredder alone sister let get beer sometime,
9158,9,23,114,happy filmed remember everyone worked paper company years never wrote anything,
9159,9,23,115,sold paper company 12 years job speak clients phone quantities types copier paper even love every minute everything owe job stupid wonderful boring amazing job,


In [None]:
# feature extraction
def extract_features(df, vectorizer):
    if vectorizer == "binary":
        vectorizer = CountVectorizer(binary=True)
    elif vectorizer == "count":
        vectorizer = CountVectorizer() 
    elif vectorizer == "tfidf":
        vectorizer = TfidfVectorizer()
    elif vectorizer == "hashing":
        vectorizer = HashingVectorizer()

    result = vectorizer.fit_transform(df["line_text"])
    return result

def feature_selection (feature_df, selection_method):
    # TODO: add feature selection e.g. DF (document frequency)
    print("nothin here yet")

In [None]:
# feature extraction
feature_df = extract_features(df, "count")

In [None]:
# save the preprocessed data
df.to_csv(PATH+"preprocessed_"+FILE, sep=",", index=True)
feature_df.to_csv(PATH+"feature_"+FILE, sep=",", index=True)