In [1]:
import pandas as pd
import seaborn as sns
import numpy as np

In [7]:
train_file = pd.read_csv('train.csv')

In [8]:
#Finding NaN values in the dataset
train_nan = train_file[train_file.isna().any(axis=1)] 

In [9]:
#looking to fill NaN values in travel_with with 'Alone'
train_nan[['travel_with','total_female','total_male']] 

Unnamed: 0,travel_with,total_female,total_male
24,,0.0,1.0
81,,0.0,1.0
94,,0.0,1.0
115,,0.0,1.0
121,,1.0,0.0
...,...,...,...
18455,,1.0,0.0
18476,,0.0,1.0
18485,,0.0,1.0
18486,,1.0,0.0


In [10]:
#Filling all null travel with cells with 'Alone'
train_file['travel_with'].fillna('Alone', inplace=True) 

In [11]:
#Dropping incompatable empty value
train_file.drop(train_file.index[316], inplace=True) 

In [12]:
#Dropping all null female values
train_file['total_female'].dropna() 

0        0.0
1        1.0
2        1.0
3        3.0
4        0.0
        ... 
18501    0.0
18502    1.0
18503    2.0
18504    1.0
18505    2.0
Name: total_female, Length: 18503, dtype: float64

In [13]:
#Dropping all male null values
train_file['total_male'].dropna() 

0        2.0
1        1.0
2        1.0
3        1.0
4        1.0
        ... 
18501    1.0
18502    0.0
18503    1.0
18504    1.0
18505    1.0
Name: total_male, Length: 18499, dtype: float64

In [14]:
#Replacing all yes/no with 1/0 respectively
train_file.replace(['Yes', 'No'], [1, 0], inplace=True) 

In [15]:
#making total packages per tour column
train_file['total_packages'] = train_file['package_transport_int'] + train_file['package_accomodation'] + \
train_file['package_food'] + train_file['package_transport_tz'] + train_file['package_sightseeing'] + \
+ train_file['package_guided_tour'] + train_file['package_insurance']

In [16]:
#making total people per tour column
train_file['total_people'] = train_file['total_female'] + train_file['total_male']

In [17]:
#makign total days column
train_file['total_days'] = train_file['night_mainland'] + train_file['night_zanzibar']

In [18]:
#Finding outliers in total people
total_people = train_file['total_people'].unique()
sorted_total_people = np.sort(total_people)

In [19]:
#Number of occurences for amount of people on the trip
people_counts = train_file['total_people'].value_counts()

In [20]:
#remove parties > 50
num_people = train_file[train_file['total_people'] > 50].index
train_file.drop(num_people , inplace=True)

In [21]:
#remove outliers for mainland? DISPLOT 
#sns.displot(train_file['night_mainland'])

In [22]:
#Finding outliers in mainland nights(discuss)
mainland_nights = train_file['night_mainland'].unique() 
sorted_mainland_nights = np.sort(mainland_nights)

In [23]:
#remove outliers for zanzibar? DISPLOT
#sns.displot(train_file['night_zanzibar'])

In [24]:
#Finding outliers in zanzibar nights(discuss)
zanibar_nights = train_file['night_zanzibar'].unique()
sorted_zanzibar_nights = np.sort(zanibar_nights)

In [25]:
#All countries into a list
#all_countries = train_file.country.unique().tolist()

In [26]:
#Getting countries where their occurence is less than 10
low_country_count = train_file['country'].value_counts().to_frame()
low_country_count = low_country_count[low_country_count['country'] < 10].index.tolist()

In [27]:
#filtering out low countries from countries
#high_country_count = [b for b in all_countries if all(a not in b for a in low_country_count)]

In [28]:
#making a continent column
europe = ['ITALY', 'UNITED KINGDOM', 'FRANCE', 'SWIZERLAND', 'SPAIN', 'DENMARK', 'BELGIUM', 'NETHERLANDS', 'NORWAY',
         'GERMANY','RUSSIA', 'GREECE','POLAND','AUSTRIA','CZECH REPUBLIC', 'IRELAND','SWEDEN', 'FINLAND','LUXEMBOURG', 'ROMANIA', 
         'PORTUGAL', 'SCOTLAND','SLOVAKIA','UKRAIN', 'HUNGARY','TURKEY']
north_america = ['UNITED STATES OF AMERICA', 'CANADA',]
latin_america = ['BRAZIL', 'ARGENTINA', 'URUGUAY','MEXICO', ]
asia = ['INDIA', 'CHINA', 'JAPAN', 'MALAYSIA', 'KOREA', 'THAILAND', 'SINGAPORE', 'PAKISTAN', 'TAIWAN', 'PHILIPINES',]
middle_east = ['LEBANON', 'ISRAEL', 'UNITED ARAB EMIRATES', 'QATAR', 'OMAN', 'UAE', 'SAUD ARABIA',]
oceania = ['AUSTRALIA', 'NEW ZEALAND',]
africa = ['RWANDA','NIGERIA', 'ZAMBIA', 'ZIMBABWE', 'SOUTH AFRICA', 'KENYA', 'DRC', 'UGANDA', 'CONGO', 'BURUNDI', 'NAMIBIA',
         'SUDAN', 'MALAWI','EGYPT', 'SRI LANKA','BOTSWANA', 'COMORO', 'ETHIOPIA','SWAZILAND',
 'MOZAMBIQUE', 'MAURITIUS']
other = ['CHILE', 'SERBIA', 'BAHRAIN', 'COLOMBIA', 'VIETNAM', 'MALT', 'KUWAIT', 'INDONESIA','GHANA', 'TRINIDAD TOBACCO',
         'CROATIA','BULGARIA','MADAGASCAR','CAMEROON','ICELAND','LATVIA','LITHUANIA','ALGERIA','MORROCO','BERMUDA','SLOVENIA',
         'SOMALI','YEMEN','VENEZUELA','ANGOLA','SEYCHELLES','NEPAL','TANZANIA','SENEGAL','JAMAICA','GAMBIA','PERU','IRAN',
         'LESOTHO','COSTARICA','GEORGIA','BOSNIA','DOMINICA','CAMBODIA','BURGARIA','AFGHANISTAN','ECUADO','PAPUA NEW GUINEA','ARMENIA',
         'CYPRUS','MACEDONIA','IVORY COAST','MONECASQUE','BARBADOS','MONTENEGRO','DJIBOUT','TUNISIA','LIBERIA','BANGLADESH','ERITREA',
         'NIGER','ESTONIA','CAPE VERDE','JORDAN']

conditions = [train_file['country'].isin(europe), 
        train_file['country'].isin(north_america), train_file['country'].isin(latin_america),
        train_file['country'].isin(asia), train_file['country'].isin(middle_east), 
        train_file['country'].isin(oceania), train_file['country'].isin(africa), 
         train_file['country'].isin(other)]

values = ['EUROPE', 'NORTH AMERICA', 'LATIN AMERICA', 'ASIA', 'MIDDLE EAST', 'OCEANIA', 'AFRICA', 'OTHER']
train_file['continent'] = np.select(conditions, values)

In [29]:
#Replacing cost categories with representative numerical values
train_file.replace({'Lower Cost' : 1, 
                    'Low Cost' : 2, 
                    'Normal Cost' : 3, 
                    'High Cost' : 4, 
                    'Higher Cost' : 5, 
                    'Highest Cost' : 6}, inplace=True)

In [30]:
#getting the correlations of the entire training table
training_correlations = train_file.corr()

  training_correlations = train_file.corr()


In [31]:
#getting the correlations of the cost category
training_corr_series = training_correlations['cost_category']

In [32]:
#getting correlations over 0.5
pos_cost_correlations = training_correlations[training_correlations['cost_category'] > 0.5]['cost_category']

In [33]:
#Counting/normalizing cost categories across continents
cont_vs_cost = pd.crosstab(train_file.continent, train_file.cost_category, normalize='columns')

In [34]:
#getting mean cost_category across continents
cont_cost_mean = train_file.groupby('continent', as_index=False).cost_category.mean()

In [35]:
#descriptive stats based on continent
descriptive_cont = train_file.groupby('continent').cost_category.describe()

In [36]:
#correlations on attributes over 0.5 grouped by continent
cont_pos_corr = train_file.groupby('continent')[['package_transport_int', 'package_accomodation', 'package_food', 'package_transport_tz',
                                'total_packages', 'cost_category']].corr()['cost_category']

In [37]:
#Counting/normalizing cost categories across age groups
age_vs_cost = pd.crosstab(train_file.age_group, train_file.cost_category, normalize='columns')

In [38]:
#getting mean cost_category across ages
age_cost_mean = train_file.groupby('age_group', as_index=False).cost_category.mean()

In [39]:
#descriptive stats based on age groups
descriptive_age = train_file.groupby('age_group').cost_category.describe()

In [40]:
#correlations on attributes over 0.5 grouped by age groups
age_pos_corr = train_file.groupby('age_group')[['package_transport_int', 'package_accomodation', 'package_food', 'package_transport_tz',
                                'total_packages', 'cost_category']].corr()['cost_category']

In [41]:
#Counting/normalizing cost categories across travel group type
travelers_vs_cost = pd.crosstab(train_file.travel_with, train_file.cost_category, normalize='columns')

In [42]:
#getting mean cost_category across travel group type
travelers_cost_mean = train_file.groupby('travel_with', as_index=False).cost_category.mean()

In [43]:
#descriptive stats based on travel group type
descriptive_travelers = train_file.groupby('travel_with').cost_category.describe()

In [44]:
#correlations on attributes over 0.5 grouped by travel group type
travelers_pos_corr = train_file.groupby('travel_with')[['package_transport_int', 'package_accomodation', 'package_food', 'package_transport_tz',
                                'total_packages', 'cost_category']].corr()['cost_category']

In [45]:
#Counting/normalizing cost categories across travel purpose
purpose_vs_cost = pd.crosstab(train_file.purpose, train_file.cost_category, normalize='columns')

In [46]:
#getting mean cost_category across purpose types
purpose_cost_mean = train_file.groupby('purpose', as_index=False).cost_category.mean()

In [47]:
#descriptive stats based on purpose type
descriptive_purpose = train_file.groupby('purpose').cost_category.describe()

In [48]:
#correlations on attributes over 0.5 grouped by purpose type
purpose_pos_corr = train_file.groupby('purpose')[['package_transport_int', 'package_accomodation', 'package_food', 'package_transport_tz',
                                'total_packages', 'cost_category']].corr()['cost_category']

In [49]:
#Counting/normalizing cost categories across main activities
mainact_vs_cost = pd.crosstab(train_file.main_activity, train_file.cost_category, normalize='columns')

In [50]:
#getting mean cost_category across mainact types
mainact_cost_mean = train_file.groupby('main_activity', as_index=False).cost_category.mean()

In [51]:
#descriptive stats based on main act type
descriptive_mainact = train_file.groupby('main_activity').cost_category.describe()

In [52]:
#correlations on attributes over 0.5 grouped by main act type
mainact_pos_corr = train_file.groupby('main_activity')[['package_transport_int', 'package_accomodation', 'package_food', 'package_transport_tz',
                                'total_packages', 'cost_category']].corr()['cost_category']