In [1]:
import pandas as pd
import numpy as np

In [2]:
def stringToBinaryDict (array):
    dictionary = {}
    uniq_array = array.unique()
    arraySize = uniq_array.size # find out the unique size of an array
    
    if arraySize > 2:
        # normal
        for i in range(arraySize):
            alist =  [0 for j in range(arraySize)]
            alist[i] = 1
            dictionary[uniq_array[i]] = alist
    elif arraySize == 2:
        # gender
        dictionary[uniq_array[0]] = 1
        dictionary[uniq_array[1]] = -1
    
    return uniq_array, dictionary

In [3]:
def clean_data(filename):
    gender_column = "What gender are you?"
    income_column = "What is your income range?"
    music_column = "What types of movies do you like? (select all that apply)"
    internet_column = "What is the average number of hours that you spend on the Internet per day?"
    children_column = "How many children do you have?"
    vehicle_column = "Do you or your family own a vehicle and if so, how many?"
    age_column = "Which age group are you in?"
    split_columns = ["ancestry", "residence", "ethnicity", "gender", "degree", "career", "music", "maritality"]

    # preprocessing
    data = pd.read_csv(filename)
    data = data.rename(index=str, columns = {data.columns[1]: "Gender Result", data.columns[28]: "Politics Result"})
    data = data.drop(data.index[117])
    split_columns_dictionary = {}
    
    # gender questions
    for i in range(1, 21):
        column = data.iloc[:, i].div(100)
        
        if (i == 1):
            data.iloc[:, i] = column
        else:
            data.iloc[:, 1] = data.iloc[:, 1] + column
    
    data.iloc[:, 1] = data.iloc[:, 1].div(20)
    
    data[gender_column] = data[gender_column].replace("Other", np.nan)
    data[gender_column] = data.apply(lambda row: ("Man" if row["Gender Result"] >= 0 else "Woman") 
                                     if pd.isnull(row[gender_column]) 
                                     else row[gender_column], axis=1)
    
    # categorical multiple choices
    categorical_range = range(21, 26)
    categorical_range.append(27)
    categorical_range.append(48)
    categorical_range.append(52)
    
    for index in range(0, len(categorical_range)):
        i = categorical_range[index]
        data.iloc[:, i] = data.iloc[:, i].fillna(data.iloc[:, i].mode()[0])
        uniq, dictionary = stringToBinaryDict(data.iloc[:, i])
        split_columns_dictionary[split_columns[index]] = uniq
        data.iloc[:, i] = data.iloc[:, i].map(dictionary)
    
    # income question
    income_dictionary = {"Below $10,000": 0, "$10,000 - $30,000": 1, "$30,000 - $50,000": 2, "$50,000 - $70,000": 3,
                         "$70,000 - $90,000": 4, "Above $90,000": 5}
    income = data[income_column].map(income_dictionary)
    data[income_column] = (income - income.mean()) / income.std()
    
    # internet question
    internet_dictionary = {"less than 1 hour": 1, "1-3 hours" : 2, "3-4 hours": 3, "4 or more hours": 4}
    internet = data[internet_column].map(internet_dictionary)
    data[internet_column] = (internet - internet.mean()) / internet.std()
    
    # children question
    children_dictionary = {"I do not have children": 0, "1": 1, "2": 2, "More than 2": 3}
    children = data[children_column].map(children_dictionary)
    data[children_column] = (children - children.mean()) / children.std()
    
    # vehicle question
    vehicle_dictionary = {"Neither me nor my family owns a vehicle.": 0, "1": 1, "2": 2, "3 or more": 3}
    vehicle = data[vehicle_column].map(vehicle_dictionary)
    data[vehicle_column] = (vehicle - vehicle.mean()) / vehicle.std()
    
    # age question
    age_dictionary = {"Under 20 years of age": 1, "20 - 30 years of age": 2, "30 - 40 years of age": 3, 
                      "40 - 50 years of age": 4, "50 or more years of age": 5}
    age = data[age_column].map(age_dictionary)
    data[age_column] = (age - age.mean()) / age.std()
    
    # politics questions
    for i in range(28, 38):
        column = data.iloc[:, i].div(100)
        
        if i == 28:
            data.iloc[:, i] = column
        else:
            data.iloc[:, 28] = data.iloc[:, 28] + column
    
    data.iloc[:, 28] = data.iloc[:, 1].div(20)
    
    
    # select all apply questions
    data.iloc[:, 39:48] = data.iloc[:, 39:48].fillna(0).replace(regex=r'^\w+', value=1)
        
    # joke ratings
    for i in range(53, 174):
        data.iloc[:, i] = data.iloc[:, i].div(100)

    # drop unnecessary columns
    data = data.drop(data.columns[29: 38], axis = 1)
    data = data.drop(data.columns[2: 21], axis = 1)

    #change column name
    data.rename(columns={ data.columns[11]: "Romance", data.columns[12]: "Horror", data.columns[13]: "Comedy", data.columns[14]: "Drama",
                         data.columns[15]: "Historical", data.columns[16]: "Animation", data.columns[17]: "Documentary", data.columns[18]: "Adventure",
                         data.columns[19]: "Fiction" }, inplace=True)
    print zip(data.columns[25:-1], range(1, len(data.columns)-25))
    rename_dict = dict(zip(data.columns[25:-1], range(1, len(data.columns)-25)))
    data.rename(columns=rename_dict, inplace=True)
    
    #reconstruct dataframe
    new_data = pd.DataFrame(data.iloc[:, 0:2], index = data.index)
    new_data[split_columns_dictionary["ancestry"]] = pd.DataFrame(data.iloc[:, 2].values.tolist(), index = data.index)
    new_data[split_columns_dictionary["residence"]] = pd.DataFrame(data.iloc[:, 3].values.tolist(), index = data.index)
    new_data[split_columns_dictionary["ethnicity"]] = pd.DataFrame(data.iloc[:, 4].values.tolist(), index = data.index)
    new_data = new_data.join(data.iloc[:, 5])
    new_data[split_columns_dictionary["degree"]] = pd.DataFrame(data.iloc[:, 6].values.tolist(), index = data.index)
    new_data = new_data.join(data.iloc[:,7])
    new_data[split_columns_dictionary["career"]] = pd.DataFrame(data.iloc[:, 8].values.tolist(), index = data.index)
    new_data = new_data.join(data.iloc[:,9:11])
    new_data = new_data.join(data.iloc[:,11:20])
    new_data[split_columns_dictionary["music"]] = pd.DataFrame(data.iloc[:, 20].values.tolist(), index = data.index)
    new_data = new_data.join(data.iloc[:, 21:24])
    new_data[split_columns_dictionary["maritality"]] = pd.DataFrame(data.iloc[:, 24].values.tolist(), index = data.index)
    new_data = new_data.join(data.iloc[:, 25:])
    
    return new_data

clean_data("data.csv")

[("Q: Why did the girl fall off the swing? A: She didn't have any arms.", 1), ('There were two sausages in a pan. One of them said to the other, "Blimey, it\'s hot in here," and the other one said, "Aaaah! A talking sausage!"', 2), ('I\xe2\x80\x99ve read so many horrible things about drinking and smoking recently that I made a new, firm New Year\xe2\x80\x99s resolution: NO MORE READING!', 3), ('Q: Why was 6 afraid of 7? A: Because 7 was a registered 6 offender.', 4), ('Q: Whats green and smells like red paint? A: Green paint.', 5), ('There were two parrots on a perch. One of them said to the other: "Can you smell fish?"', 6), ('How funny did you find the above image?', 7), ('They say you can\xe2\x80\x99t get a decent job without education. But look at Albert Einstein \xe2\x80\x93 he was a drop-out and still ended up being the first man on the moon!', 8), ("A duck goes into a bar and says, 'Pint of bitter and a bag of your excellent salt and vinegar crisps.' The barman responds, 'You sp

Unnamed: 0,Respondent ID,Gender Result,Africa,East Asia and the Pacific,Europe and Central Asia,South Asia,Middle East and North Africa,Latin America and The Carribean,West Coast,The Midwest,...,113,114,115,116,117,118,119,120,121,MTurk Worker ID
0,10020787097,0.1615,1,0,0,0,0,0,1,0,...,-1.00,-1.00,0.01,-1.00,-1.00,-0.52,-1.00,-1.00,-1.00,3ATDVKQJ43U72A
1,10020687640,0.4720,0,1,0,0,0,0,1,0,...,-0.64,-0.97,-0.91,-0.56,-0.99,0.51,-0.99,-0.79,-0.76,A667L63P4M8NO
2,10020467615,0.5475,0,0,1,0,0,0,0,1,...,-1.00,-0.26,0.69,0.30,-0.77,0.49,-0.72,0.31,0.19,A2JMIAOV67CPHI
3,10020014023,0.3385,0,0,1,0,0,0,1,0,...,-0.74,-0.97,0.92,0.29,-0.95,-0.92,-0.86,0.56,0.20,Zritmmpf554a3a
4,10018012695,0.2310,0,0,0,1,0,0,0,1,...,-0.76,-0.82,0.89,0.49,-1.00,0.27,-0.89,-0.60,-0.36,A2AO7QP5THYKQF
5,10017587012,0.1340,0,0,1,0,0,0,1,0,...,-0.84,-0.83,-0.78,-0.71,0.48,-0.90,-0.95,-0.85,-0.78,P1JQDFTAE25A2A
6,10017567998,0.5425,1,0,0,0,0,0,0,0,...,-0.97,-0.29,0.76,0.51,-0.99,0.20,-0.89,0.48,0.13,A5V3ZMQI0PU3F
7,10017339230,0.4160,0,0,1,0,0,0,0,1,...,-0.97,-0.98,0.19,-0.37,-0.96,0.20,-0.58,0.00,0.13,A4JU6ZFL8C1VG
8,10017302977,0.4620,0,0,1,0,0,0,0,0,...,-1.00,-1.00,-0.83,-0.53,0.54,-0.97,0.78,-0.88,0.72,A3TK9CLBA65ZZE
9,10017282204,0.3595,0,0,1,0,0,0,0,1,...,0.12,0.14,0.41,0.14,-0.07,0.19,0.00,0.05,0.09,A9WIOFVRSYW3L


In [5]:
import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from string import punctuation

# nltk.download()
# nltk.download('punkt')
# if you have not downloaded the nltk corpus, then uncomment the lines above

In [8]:
def parseJoke(filename):
    data = pd.read_csv(filename)
    return data

def CreateMyStopWords ():
    stopword = stopwords.words("english")
    stopword.remove(u't')
    stopword.remove(u's')
    stopword.append(u"'s")
    stopword.append(u"'t")
    stopword.append(u"n't")
    stopword.append(u"'d")
    stopword.append(u"'re")
    stopword.append(u"cannot")
    stopword.append(u"'ll")
    stopword.append(u"'ve")
    stopword.append(u"'m")
    stopword.append(u"q")
    stopword.append(u"could")
    stopword.append(u"would")
    return stopword
    
def is_valid_hyphen_word(str):
    flag = False
    
    if str[0].isalpha() and str[len(str) - 1].isalpha():
        for chr in str:
            if chr.isalpha():
                flag = False
            elif chr == "-":
                if flag:
                    return False
                else:
                    flag = True
            else:
                return False
        return True
    return False

def DataCleaningForKaggleSA(data):
    stopword = CreateMyStopWords()
    porterStemmer = PorterStemmer()
    
    for i in range(len(data)):
        row = data.iloc[i]
        sentence = row["Joke"].replace("’", "'").lower()
        for chr in sentence:
            if (ord(chr) >= 128):
                sentence = sentence.replace(chr, '')
                
        words = word_tokenize(sentence)
        cleanData = []
        
        for w in words:
            if w not in stopword:
                if all(chr not in punctuation for chr in w) or is_valid_hyphen_word(w):
                    cleanData.append(porterStemmer.stem(w))
            
        cleanSentence = ' '.join(cleanData)
        data.set_value(i, "Joke", cleanSentence)
        
    return data

data = parseJoke("Jokes.csv")

DataCleaningForKaggleSA(data).to_csv("cleanedJokes.csv")

KeyError: 'Joke'