# Preprocessing - "The Office" dataset
This notebook provides parameterizable functions for preprocessing "The Office" dataset prior to further NLP analysis.

In [28]:
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer


# Relative directory used both for reading the input CSV and for writing the preprocessed output.
PATH = "../data/"
# File name of the raw script-lines CSV inside PATH.
FILE = "unmod_the-office-lines - scripts.csv"

In [26]:
# Load the raw comma-separated file; its "id" column becomes the DataFrame index.
df = pd.read_csv(PATH+FILE, sep=",", index_col="id")

In [7]:
def extract_directorals(df):
    """Move bracketed stage directions from ``line_text`` into ``directionals``.

    Every ``[...]`` span found in ``line_text`` is captured; several spans in
    the same row are joined with ``", "``. Rows without any bracketed span get
    NaN in ``directionals``. The spans are then removed from ``line_text`` and
    the remaining text is stripped of leading/trailing whitespace.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame with a string column ``line_text``.

    Returns:
        The same DataFrame, with the new ``directionals`` column and the
        cleaned ``line_text`` column.
    """
    pattern = r"\[(.*?)\]"

    def _join_directions(found):
        # findall yields NaN for missing text and [] when nothing matched;
        # both mean "no directions on this row".
        if not isinstance(found, list) or not found:
            return np.nan
        # Empty brackets ("[]") carry no direction text, so they are skipped.
        return ", ".join(part for part in found if part)

    # findall works row by row, so it needs neither a unique index nor at
    # least one match somewhere in the frame (extractall + unstack needed both).
    df["directionals"] = df["line_text"].str.findall(pattern).map(_join_directions)
    # Delete the extracted spans from the spoken text.
    df["line_text"] = df["line_text"].str.replace(pattern, "", regex=True).str.strip()
    return df


In [25]:
def remove_stopwords(df):
    """Return ``line_text`` with English stop words removed.

    Each line is tokenized with ``word_tokenize``; tokens that are English
    stop words are dropped and the remaining tokens are re-joined with single
    spaces. The comparison is case-insensitive: the stop-word list is
    lowercase, so capitalized forms such as "The" or "I" would otherwise
    never be removed.

    Args:
        df: DataFrame with a string column ``line_text``.

    Returns:
        A Series of filtered strings, aligned with ``df``. ``df`` itself is
        not modified.
    """
    stop_words = set(stopwords.words('english'))

    def _drop_stop_words(text):
        kept = [token for token in word_tokenize(text) if token.lower() not in stop_words]
        return " ".join(kept)

    return df["line_text"].apply(_drop_stop_words)

def remove_punctuation(df):
    """Return ``line_text`` with punctuation stripped.

    Every character that is neither a word character nor whitespace is
    deleted. ``df`` itself is not modified.
    """
    lines = df["line_text"]
    return lines.str.replace(pat=r"[^\w\s]", repl="", regex=True)

def tokenize(df):
    """Return ``line_text`` with every line split into a list of tokens.

    Tokenization is delegated to ``word_tokenize``. ``df`` itself is not
    modified.
    """
    lines = df["line_text"]
    return lines.apply(word_tokenize)

def segmentation(df):
    """Return ``line_text`` with every line split on single space characters.

    Consecutive spaces therefore produce empty strings in the resulting
    lists. ``df`` itself is not modified.
    """
    def _split_on_space(text):
        return text.split(" ")

    lines = df["line_text"]
    return lines.apply(_split_on_space)

def normalize(df):
    """Return ``line_text`` converted to lowercase.

    ``df`` itself is not modified.
    """
    def _to_lower(text):
        return text.lower()

    lines = df["line_text"]
    return lines.apply(_to_lower)

def lemmatize(df):
    """Return ``line_text`` with every token replaced by its WordNet lemma.

    ``WordNetLemmatizer.lemmatize`` operates on a single word, so each line is
    first tokenized with ``word_tokenize``; every token is lemmatized (with
    the lemmatizer's default part-of-speech setting) and the results are
    re-joined with single spaces. Passing a whole line to the lemmatizer at
    once would treat it as one unknown word and leave it unchanged.

    Args:
        df: DataFrame with a string column ``line_text``.

    Returns:
        A Series of lemmatized strings, aligned with ``df``. ``df`` itself is
        not modified.
    """
    wordnet_lemmatizer = WordNetLemmatizer()

    def _lemmatize_line(text):
        return " ".join(wordnet_lemmatizer.lemmatize(token) for token in word_tokenize(text))

    return df["line_text"].apply(_lemmatize_line)

def stem(df):
    """Return ``line_text`` with every token reduced to its Porter stem.

    ``PorterStemmer.stem`` operates on a single word, so each line is first
    tokenized with ``word_tokenize``; every token is stemmed and the results
    are re-joined with single spaces. Passing a whole line to the stemmer at
    once would apply the suffix rules only to the end of the line.

    Args:
        df: DataFrame with a string column ``line_text``.

    Returns:
        A Series of stemmed strings, aligned with ``df``. ``df`` itself is
        not modified.
    """
    porter_stemmer = PorterStemmer()

    def _stem_line(text):
        return " ".join(porter_stemmer.stem(token) for token in word_tokenize(text))

    return df["line_text"].apply(_stem_line)


In [None]:
# feature extraction
def extract_features(df, vectorizer):
    """Vectorize ``line_text`` with the requested vectorizer.

    Args:
        df: DataFrame with a string column ``line_text``.
        vectorizer: Either one of the names ``"count"``, ``"tfidf"`` or
            ``"hashing"`` (a default-configured ``CountVectorizer``,
            ``TfidfVectorizer`` or ``HashingVectorizer`` is created), or an
            already constructed object providing ``fit_transform``, which is
            used as-is.

    Returns:
        Whatever ``fit_transform`` of the chosen vectorizer returns for the
        ``line_text`` column.

    Raises:
        ValueError: If ``vectorizer`` is a string that is not one of the
            supported names.
    """
    if isinstance(vectorizer, str):
        if vectorizer == "count":
            vectorizer = CountVectorizer()
        elif vectorizer == "tfidf":
            vectorizer = TfidfVectorizer()
        elif vectorizer == "hashing":
            vectorizer = HashingVectorizer()
        else:
            # Fail early with a clear message instead of an AttributeError
            # from calling fit_transform on the unrecognized string.
            raise ValueError(
                f"Unknown vectorizer {vectorizer!r}; expected 'count', 'tfidf' or 'hashing'"
            )

    return vectorizer.fit_transform(df["line_text"])
    

In [None]:
# TODO: add a single function that chains the preprocessing steps above and exposes them as parameters

In [None]:
# Save the DataFrame into the same directory as the input, prefixing the file
# name with "preprocessed_"; the index is written out as well.
# NOTE(review): no visible cell applies the preprocessing functions to df
# before this save — confirm the intended pipeline runs first.
df.to_csv(PATH+"preprocessed_"+FILE, sep=",", index=True)