In [1]:
import pandas as pd

# Read the csv file of thanksgiving dinner survey
data = pd.read_csv("datasets/thanksgiving.csv", encoding = "Latin-1")

In [2]:
# Display column names
col_names = data.columns
print(col_names)

Index([u'RespondentID', u'Do you celebrate Thanksgiving?',
       u'What is typically the main dish at your Thanksgiving dinner?',
       u'What is typically the main dish at your Thanksgiving dinner? - Other (please specify)',
       u'How is the main dish typically cooked?',
       u'How is the main dish typically cooked? - Other (please specify)',
       u'What kind of stuffing/dressing do you typically have?',
       u'What kind of stuffing/dressing do you typically have? - Other (please specify)',
       u'What type of cranberry saucedo you typically have?',
       u'What type of cranberry saucedo you typically have? - Other (please specify)',
       u'Do you typically have gravy?',
       u'Which of these side dishes aretypically served at your Thanksgiving dinner? Please select all that apply. - Brussel sprouts',
       u'Which of these side dishes aretypically served at your Thanksgiving dinner? Please select all that apply. - Carrots',
       u'Which of these side dishes arety

In [3]:
# Keeping data for people who celebrate thanksgiving
# (Keeping only the rows where the response is "Yes")
celebrates_thanksgiving = data["Do you celebrate Thanksgiving?"] == "Yes"
thanksgiving_celebrate_data = data.loc[celebrates_thanksgiving, :]

# Cross verifying 
print(data["Do you celebrate Thanksgiving?"].value_counts())
print((thanksgiving_celebrate_data.shape)[0])

Yes    980
No      78
Name: Do you celebrate Thanksgiving?, dtype: int64
980


In [4]:
# Exploring main dishes
print(thanksgiving_celebrate_data["What is typically the main dish at your Thanksgiving dinner?"].value_counts())

Turkey                    859
Other (please specify)     35
Ham/Pork                   29
Tofurkey                   20
Chicken                    12
Roast beef                 11
I don't know                5
Turducken                   3
Name: What is typically the main dish at your Thanksgiving dinner?, dtype: int64


In [11]:
# Exploring gravy Tofurkey dish
is_dish_tofurkey = thanksgiving_celebrate_data["What is typically the main dish at your Thanksgiving dinner?"] == "Tofurkey"
tofurkey = thanksgiving_celebrate_data.loc[is_dish_tofurkey,:]

# Display counts of gravy and non-gravy tofurkey dishes
tofurkey["Do you typically have gravy?"].value_counts()

Yes    12
No      8
Name: Do you typically have gravy?, dtype: int64

In [12]:
# Exploring Pies ate on thanksgiving
apple_isnull = thanksgiving_celebrate_data["Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Apple"].isnull()
pumpkin_isnull = thanksgiving_celebrate_data["Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Pumpkin"].isnull()
pecan_isnull = thanksgiving_celebrate_data["Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Pecan"].isnull()

ate_pies = (apple_isnull) & (pumpkin_isnull) & (pecan_isnull)

# Fraction of people who didn't eat any of the pie
print(ate_pies.value_counts(normalize = True))

False    0.893878
True     0.106122
dtype: float64


In [13]:
# Converting age to numbers
def convert_age_to_int(string):
    # If string is null -  then return None
    if pd.isnull(string):
        return None
    else:
        # Otherwise split at space and extract the 
        # first element
        strings = string.split(" ")
        age = strings[0]
        # If "Age" is of the format of "60+", then
        # remove the + sign
        age = age.replace("+", "")
        return int(age)

In [14]:
ages = data["Age"]
data["int_age"] = ages.apply(convert_age_to_int)
data["int_age"].describe()

count    1025.000000
mean       39.383415
std        15.398493
min        18.000000
25%              NaN
50%              NaN
75%              NaN
max        60.000000
Name: int_age, dtype: float64

In [15]:
# Converting income to integers
def convert_income_to_int(string):
    # If the income col is emply, then return None
    if pd.isnull(string):
        return None
    else:
        # Otherwise split income string at space
        # and extract first word
        strings = string.split(" ")
        first_word = strings[0]
        
        if first_word == "Prefer":
            return None
        else:
            # Remove "," and "$"
            first_word = first_word.replace("$", "")
            first_word = first_word.replace(",", "")
            
            # convert string to int
            income = int(first_word)
            return income

In [16]:
# Extract incomes column
incomes = data["How much total combined money did all members of your HOUSEHOLD earn last year?"]

# Convert incomes to integers
data["int_income"] = incomes.apply(convert_income_to_int)

data["int_income"].describe()

count       889.000000
mean      74077.615298
std       59360.742902
min           0.000000
25%                NaN
50%                NaN
75%                NaN
max      200000.000000
Name: int_income, dtype: float64

In [17]:
# Find relation between income and travel data for
# thanksgiving (hypothesis: people earning less are
# young and will celebrate thanksgiving at parents
# house and people earning more will celebrate at 
# their own house)

# Extract data for income less than 150,000
is_income_less_150k = data["int_income"] < 150000
income_less_150k_data = data.loc[is_income_less_150k]

dist_travel_income_less_150k = income_less_150k_data["How far will you travel for Thanksgiving?"]
dist_travel_income_less_150k.value_counts()

Thanksgiving is happening at my home--I won't travel at all                         281
Thanksgiving is local--it will take place in the town I live in                     203
Thanksgiving is out of town but not too far--it's a drive of a few hours or less    150
Thanksgiving is out of town and far away--I have to drive several hours or fly       55
Name: How far will you travel for Thanksgiving?, dtype: int64

In [18]:
# Extract data for income more than 150,000
is_income_more_150k = data["int_income"] > 150000
income_more_150k_data = data.loc[is_income_more_150k]

dist_travel_income_more_150k = income_more_150k_data["How far will you travel for Thanksgiving?"]
dist_travel_income_more_150k.value_counts()

Thanksgiving is happening at my home--I won't travel at all                         49
Thanksgiving is local--it will take place in the town I live in                     25
Thanksgiving is out of town but not too far--it's a drive of a few hours or less    16
Thanksgiving is out of town and far away--I have to drive several hours or fly      12
Name: How far will you travel for Thanksgiving?, dtype: int64

In [19]:
# Linking friendship and age

friendship_and_age = data.pivot_table(index = "Have you ever tried to meet up with hometown friends on Thanksgiving night?", 
                                     columns = 'Have you ever attended a "Friendsgiving?"',
                                     values = "int_age")

friendship_and_age.index.name = "Meetup hometown friends?"
friendship_and_age.columns.name = "Friendsgiving?"
print(friendship_and_age)

Friendsgiving?                   No        Yes
Meetup hometown friends?                      
No                        42.283702  37.010526
Yes                       41.475410  33.976744


In [20]:
# Linking friendship and income

friendship_and_income = data.pivot_table(index = "Have you ever tried to meet up with hometown friends on Thanksgiving night?", 
                                         columns = 'Have you ever attended a "Friendsgiving?"',
                                         values = "int_income")
friendship_and_income.index.name = "Meetup hometown friends?"
friendship_and_income.columns.name = "Friendsgiving?"
print(friendship_and_income)

Friendsgiving?                      No           Yes
Meetup hometown friends?                            
No                        78914.549654  72894.736842
Yes                       78750.000000  66019.736842
