# Preprocessing - "The Office" dataset
This notebook provides parameterizable functions for preprocessing the script lines of "The Office" for further NLP analysis.

In [2]:
import numpy as np
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import HashingVectorizer


# Directory prefix (relative, with trailing slash) and file name of the raw
# script CSV; the two are joined by plain string concatenation (PATH + FILE).
PATH = "../data/"
FILE = "unmod_the-office-lines - scripts.csv"

In [3]:
# Load the raw comma-separated script file; the "id" column becomes the index.
df = pd.read_csv(PATH+FILE, sep=",", index_col="id")

In [4]:
def extract_directorals(df):
    """Move bracketed text out of ``line_text`` into a ``directionals`` column.

    Every ``[...]`` span found in ``line_text`` is captured (without the
    brackets). The captures of one row are joined with ", " into a single
    string and stored in a new ``directionals`` column; rows without any
    bracketed span get NaN. The bracketed spans are then removed from
    ``line_text`` and the remainder is stripped of surrounding whitespace.

    Args:
        df: DataFrame with a string column ``line_text``.

    Returns:
        The same DataFrame object, modified in place.
    """
    bracket_pattern = r"\[(.*?)\]"

    # findall() works row by row, so the result lines up with df positionally.
    # This avoids the extractall()/unstack() reshape, which raises on a
    # non-unique index and depends on index alignment when assigning back.
    captures = df["line_text"].str.findall(bracket_pattern)
    df["directionals"] = captures.apply(
        lambda parts: ", ".join(parts) if isinstance(parts, list) and parts else np.nan
    )

    # delete the extracted text (brackets included) from line_text
    df["line_text"] = (
        df["line_text"].str.replace(bracket_pattern, "", regex=True).str.strip()
    )
    return df


In [5]:
# bare string preprocessing
def remove_punctuation(df):
    """Return ``line_text`` with punctuation removed.

    Every character that is neither a word character (``\\w``, which includes
    the underscore) nor whitespace is deleted. The input frame is not modified.
    """
    not_word_or_space = r"[^\w\s]"
    lines = df["line_text"]
    return lines.str.replace(not_word_or_space, "", regex=True)

def normalize(df):
    """Return ``line_text`` converted to lower case.

    Uses the vectorized string accessor so that missing values (NaN/None)
    are passed through instead of raising ``AttributeError`` as a plain
    ``x.lower()`` call would. The input frame is not modified.
    """
    return df["line_text"].str.lower()

def segmentation(df):
    """Return ``line_text`` split on single spaces, one list of parts per row.

    Splitting is on the literal " " separator, so consecutive spaces produce
    empty strings in the result. Missing values (NaN/None) are passed through
    instead of raising ``AttributeError`` as a plain ``x.split(" ")`` call
    would. The input frame is not modified.
    """
    return df["line_text"].str.split(" ")

In [35]:
# tokenization
def tokenize(df):
    """Return ``line_text`` with each row run through NLTK's ``word_tokenize``.

    The input frame is not modified.
    """
    lines = df["line_text"]
    return lines.apply(word_tokenize)

def lemmatize(df):
    """Return ``line_text`` with every token replaced by its WordNet lemma.

    Each row is tokenized with ``word_tokenize``, every token is passed to
    ``WordNetLemmatizer.lemmatize`` (no part-of-speech argument is given), and
    the results are re-joined with single spaces.

    Original author's note: this works, but the results are not very good,
    which the author attributes to the simple speech of the characters.
    """
    lemmatizer = WordNetLemmatizer()

    def _lemmatize_line(text):
        lemmas = [lemmatizer.lemmatize(token) for token in word_tokenize(text)]
        return " ".join(lemmas)

    return df["line_text"].apply(_lemmatize_line)

def stem(df):
    """Return ``line_text`` with every token reduced to its Porter stem.

    Each row is tokenized with ``word_tokenize``, every token is passed to
    ``PorterStemmer.stem``, and the stems are re-joined with single spaces.
    """
    stemmer = PorterStemmer()

    def _stem_line(text):
        stems = [stemmer.stem(token) for token in word_tokenize(text)]
        return " ".join(stems)

    return df["line_text"].apply(_stem_line)

def remove_stopwords(df):
    """Return ``line_text`` with English stop words removed.

    Each row is tokenized with ``word_tokenize``; tokens found in NLTK's
    English stop-word list are dropped and the remaining tokens are re-joined
    with single spaces. The surviving tokens keep their original casing.
    """
    stop_words = frozenset(stopwords.words("english"))

    def _drop_stopwords(text):
        # NLTK's list is all lower case, so compare case-insensitively;
        # otherwise capitalised stop words ("The", "I", "You") slip through
        # whenever the text has not been lower-cased beforehand.
        kept = [
            token
            for token in word_tokenize(text)
            if token.lower() not in stop_words
        ]
        return " ".join(kept)

    return df["line_text"].apply(_drop_stopwords)

# tagging
def pos_tag(df):
    """Return ``line_text`` as part-of-speech tagged tokens.

    Each row is tokenized with ``word_tokenize`` and the tokens are passed to
    ``nltk.pos_tag``; the row's value is whatever that call returns.
    """
    def _tag_line(text):
        tokens = word_tokenize(text)
        return nltk.pos_tag(tokens)

    return df["line_text"].apply(_tag_line)

In [7]:
# feature extraction
def extract_features(df, vectorizer):
    """Vectorize ``line_text`` and return the fitted feature matrix.

    Args:
        df: DataFrame with a string column ``line_text``.
        vectorizer: Either one of the names ``"count"``, ``"tfidf"`` or
            ``"hashing"`` (a default-configured ``CountVectorizer``,
            ``TfidfVectorizer`` or ``HashingVectorizer`` is created), or an
            already constructed object exposing ``fit_transform``.

    Returns:
        Whatever ``fit_transform`` of the selected vectorizer returns for the
        ``line_text`` column.

    Raises:
        ValueError: If ``vectorizer`` is a string that is not a known name.
    """
    if isinstance(vectorizer, str):
        if vectorizer == "count":
            model = CountVectorizer()
        elif vectorizer == "tfidf":
            model = TfidfVectorizer()
        elif vectorizer == "hashing":
            model = HashingVectorizer()
        else:
            # Fail early with a clear message instead of an AttributeError
            # from calling fit_transform on the unrecognised string.
            raise ValueError(
                f"Unknown vectorizer {vectorizer!r}; "
                "expected 'count', 'tfidf' or 'hashing'"
            )
    else:
        # Anything that is not a name is used as the vectorizer itself.
        model = vectorizer

    return model.fit_transform(df["line_text"])
    

In [None]:
# TODO: make a function that sums all functions up and makes parametrized preprocessing easier
# TODO: implement pipeline for preprocessing

# ideas of possible preprocessing steps: https://medium.com/analytics-vidhya/nlp-preprocessing-pipeline-what-when-why-2fc808899d1f

In [None]:
# preprocessing
# Each step reads df["line_text"] and the result is written back to the same
# column, so the order of the steps matters.
# 1. move bracketed text into the new "directionals" column
df = extract_directorals(df)
# 2. drop every character that is neither a word character nor whitespace
df["line_text"] = remove_punctuation(df)
# 3. drop tokens found in NLTK's English stop-word list
df["line_text"] = remove_stopwords(df)

# 4. replace each token by its WordNet lemma
# (normalize() and stem() are defined above but are not part of this run)
df["line_text"] = lemmatize(df)

# feature extraction
# "count" selects CountVectorizer; the result is the return value of its
# fit_transform() - presumably a sparse matrix rather than a DataFrame,
# despite the variable name (TODO confirm).
feature_df = extract_features(df, "count")

In [None]:
# save the preprocessed data
df.to_csv(PATH+"preprocessed_"+FILE, sep=",", index=True)

# extract_features() returns the vectorizer's fit_transform() output, which for
# scikit-learn's text vectorizers is a SciPy sparse matrix and has no to_csv()
# method. Wrap it in a sparse-backed DataFrame first: rows are keyed by df's
# index, columns are the positional feature indices of the matrix.
# NOTE: CSV is a dense format, so the written file contains every zero and can
# become large for a big vocabulary.
feature_frame = pd.DataFrame.sparse.from_spmatrix(feature_df, index=df.index)
feature_frame.to_csv(PATH+"feature_"+FILE, sep=",", index=True)