In [23]:
import os
import numpy as np
import pandas as pd
import csv
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn import feature_selection, linear_model

# pd.set_option('display.max_rows', 10)
# pd.set_option('display.notebook_repr_html', True)
# pd.set_option('display.max_columns', 10)

%matplotlib inline
plt.style.use('ggplot')

In [24]:
#import dataset
users = pd.read_csv(os.path.join('datasets', 'airbnb_clean_2.csv'))

users.drop(["id", "account", "active", "booked", "active_to_booked"], axis=1, inplace=True)

users.head(3)

Unnamed: 0,gender,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination,days
0,FEMALE,44.0,facebook,0,en,direct,direct,linked,Web,Mac Desktop,Safari,US,3.0
1,MALE,47.0,basic,0,en,sem-brand,google,tracked-other,Web,iPad,Mobile Safari,US,2.0
2,FEMALE,56.0,facebook,0,en,direct,direct,linked,Web,Mac Desktop,Safari,US,68.0


In [25]:
users[ ['age', 'days'] ].corr()

Unnamed: 0,age,days
age,1.0,-0.0143
days,-0.0143,1.0


# Dummy variables

#### Gender

In [26]:
users.gender.value_counts()

FEMALE    8575
MALE      6895
OTHER       38
Name: gender, dtype: int64

In [27]:
users = users[users.gender !='OTHER']

users.gender.value_counts()

FEMALE    8575
MALE      6895
Name: gender, dtype: int64

In [28]:
users_gender = pd.get_dummies(users.gender, prefix = 'Gender')

In [29]:
users_gender.head(3)

Unnamed: 0,Gender_FEMALE,Gender_MALE
0,1.0,0.0
1,0.0,1.0
2,1.0,0.0


In [30]:
users_gender.rename(columns={'Gender_FEMALE': 'female',
                        'Gender_MALE': 'male'}, inplace = True)

In [31]:
users_gender.head(3)

Unnamed: 0,female,male
0,1.0,0.0
1,0.0,1.0
2,1.0,0.0


In [32]:
users = users.join([users_gender])

In [33]:
users.drop(["gender"], axis=1, inplace=True)

In [34]:
users.columns

Index([u'age', u'signup_method', u'signup_flow', u'language',
       u'affiliate_channel', u'affiliate_provider', u'first_affiliate_tracked',
       u'signup_app', u'first_device_type', u'first_browser',
       u'country_destination', u'days', u'female', u'male'],
      dtype='object')

In [35]:
users[ ['female', 'male', 'days'] ].corr()

Unnamed: 0,female,male,days
female,1.0,-1.0,0.037687
male,-1.0,1.0,-0.037687
days,0.037687,-0.037687,1.0


#### Signup method

In [36]:
users.signup_method.value_counts()

basic       9478
facebook    5992
Name: signup_method, dtype: int64

In [37]:
users_signup_method = pd.get_dummies(users.signup_method, prefix = 'signup')

In [38]:
users_signup_method.head(3)

Unnamed: 0,signup_basic,signup_facebook
0,0.0,1.0
1,1.0,0.0
2,0.0,1.0


In [39]:
users = users.join([users_signup_method])

In [40]:
users.columns

Index([u'age', u'signup_method', u'signup_flow', u'language',
       u'affiliate_channel', u'affiliate_provider', u'first_affiliate_tracked',
       u'signup_app', u'first_device_type', u'first_browser',
       u'country_destination', u'days', u'female', u'male', u'signup_basic',
       u'signup_facebook'],
      dtype='object')

In [41]:
users[ ['signup_basic', 'signup_facebook', 'days'] ].corr()

Unnamed: 0,signup_basic,signup_facebook,days
signup_basic,1.0,-1.0,-0.065527
signup_facebook,-1.0,1.0,0.065527
days,-0.065527,0.065527,1.0


#### Language

In [42]:
users.language.value_counts()

en    15011
zh      101
fr       99
de       53
es       53
ko       43
ru       21
it       20
ja       19
pt       14
sv       11
no        6
da        5
nl        4
el        2
pl        2
tr        2
cs        1
fi        1
is        1
hu        1
Name: language, dtype: int64

In [43]:
users.language = np.where( users.language !='en', 'non-english', 'english' )

In [44]:
users.language.value_counts()

english        15011
non-english      459
Name: language, dtype: int64

In [21]:
# def language_groupings():
#     for i in users:
#         if users.language !='en':
#             replace(users.language.str, 'non-english')
#         else: 
#             replace(users.language.str, 'english')
#     return users

In [22]:
users['language'] = users.apply(lambda row: language_groupings)

In [28]:
users[ users.language !='en' ]

Unnamed: 0,age,signup_method,signup_flow,language,affiliate_channel,affiliate_provider,first_affiliate_tracked,signup_app,first_device_type,first_browser,country_destination,days,female,male,signup_basic,signup_facebook
18,36.0,facebook,0,hu,direct,direct,omg,Web,iPad,Mobile Safari,other,46.0,1.0,0.0,0.0,1.0
117,31.0,basic,0,it,direct,direct,linked,Web,Mac Desktop,Safari,GB,4.0,1.0,0.0,1.0,0.0
183,29.0,facebook,0,no,sem-brand,google,omg,Web,Mac Desktop,Safari,US,20.0,0.0,1.0,0.0,1.0
255,26.0,facebook,0,ja,direct,direct,linked,Web,Mac Desktop,Safari,US,1.0,0.0,1.0,0.0,1.0
260,35.0,basic,0,ko,direct,direct,linked,Web,Windows Desktop,IE,US,65.0,1.0,0.0,1.0,0.0
327,26.0,facebook,0,fr,direct,direct,linked,Web,Mac Desktop,Firefox,IT,102.0,0.0,1.0,0.0,1.0
356,49.0,facebook,0,nl,sem-non-brand,vast,omg,Web,Windows Desktop,IE,US,9.0,0.0,1.0,0.0,1.0
382,29.0,facebook,0,ja,direct,direct,linked,Web,Windows Desktop,Chrome,US,0.0,0.0,1.0,0.0,1.0
395,29.0,basic,0,de,seo,google,omg,Web,Mac Desktop,Safari,US,156.0,1.0,0.0,1.0,0.0
402,34.0,facebook,0,ru,direct,direct,linked,Web,Windows Desktop,Chrome,US,5.0,1.0,0.0,0.0,1.0


In [29]:
users_language = users [['language']]

In [31]:
users_language.head(3)

Unnamed: 0,language
0,en
1,en
2,en


In [34]:
users.groupby(users.language == 'en').sum()

Unnamed: 0_level_0,age,signup_flow,days,female,male,signup_basic,signup_facebook
language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
False,14365.0,366,14051.0,248.0,211.0,223.0,236.0
True,555539.0,14858,444675.0,8327.0,6684.0,9255.0,5756.0


In [23]:
users.language.value_counts()

en    15011
zh      101
fr       99
de       53
es       53
ko       43
ru       21
it       20
ja       19
pt       14
sv       11
no        6
da        5
nl        4
el        2
pl        2
tr        2
cs        1
fi        1
is        1
hu        1
Name: language, dtype: int64

In [22]:
# for language, row in users.iterrows():
#     df.loc[language, "A"] = "I am working! {}".format(row["B"])

In [23]:
# users.language.replace(['zh', 'fr', 'de', 'es', 'ko'], 
#                      ['non-en'])

In [24]:
# # #definitely could write this better
# users.language = users.language.str.replace('zh','non-english')
# users.language = users.language.str.replace('fr','non-english')
# users.language = users.language.str.replace('de','non-english')
# users.language = users.language.str.replace('es','non-english')
# users.language = users.language.str.replace('ko','non-english')
# users.language = users.language.str.replace('ru','non-english')
# users.language = users.language.str.replace('it','non-english')
# users.language = users.language.str.replace('ja','non-english')
# users.language = users.language.str.replace('pt','non-english')
# users.language = users.language.str.replace('sv','non-english')
# users.language = users.language.str.replace('no','non-english')
# users.language = users.language.str.replace('da','non-english')
# users.language = users.language.str.replace('nl','non-english')
# users.language = users.language.str.replace('el','non-english')
# users.language = users.language.str.replace('pl','non-english')
# users.language = users.language.str.replace('tr','non-english')
# users.language = users.language.str.replace('cs','non-english')
# users.language = users.language.str.replace('fi','non-english')
# users.language = users.language.str.replace('is','non-english')
# users.language = users.language.str.replace('hu','non-english')

In [23]:
# users_language = pd.get_dummies(users.language, prefix = 'language')