### Packages

In [272]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

### Load Files

In [320]:
# Load Training Data
train_data_new = pd.read_csv("Data_Sets/training_set_VU_DM.csv")

### Get overview of missing data per feature

In [275]:
table = []
for i in train_data_new.columns:
    total_null = train_data_new[i].isnull().sum()
    table.append([i, total_null, 100*total_null/len(train_data_new[i])])

In [276]:
missing_values_overview = pd.DataFrame(table, columns = ["Feature Name", "Total Null", "% Null"])
missing_values_overview.sort_values("% Null",ascending = False) # arrange features based on % Null 

Unnamed: 0,Feature Name,Total Null,% Null
29,comp1_rate_percent_diff,4863908,98.095353
44,comp6_rate_percent_diff,4862173,98.060362
27,comp1_rate,4838417,97.58125
28,comp1_inv,4828788,97.387053
38,comp4_rate_percent_diff,4827261,97.356256
52,gross_bookings_usd,4819957,97.208949
47,comp7_rate_percent_diff,4819832,97.206428
42,comp6_rate,4718190,95.156511
4,visitor_hist_starrating,4706481,94.920364
5,visitor_hist_adr_usd,4705359,94.897735


### Filling in the missing values for "prop_review_score"

In [334]:
train_data_new.loc[:,"prop_review_score"][train_data_new.loc[:,"prop_review_score"].isnull()] = train_data_new.loc[:,"prop_review_score"].median()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


### Filling in the missing values for "prop_location_score2"

In [335]:
temp_df = train_data_new.loc[:,["prop_id","prop_location_score2"]][train_data_new.loc[:,"prop_location_score2"].notnull()].groupby(["prop_id"]).max()
dict_ids = temp_df.to_dict()['prop_location_score2'] # this is a list of property id's and the mode prop_location_score2
train_data_new["prop_location_score2"][train_data_new["prop_location_score2"].isnull()] = train_data_new["prop_id"].map(dict_ids)
train_data_new["prop_location_score2"][train_data_new["prop_location_score2"].isnull()] = train_data_new["prop_location_score2"].median()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


### Filling in the missing values for "orig_destination_distance"

In [336]:
temp_df = train_data_new[["visitor_location_country_id","prop_country_id","orig_destination_distance"]][train_data_new["orig_destination_distance"].notnull()]
temp_df['new_col'] = list(zip(temp_df["visitor_location_country_id"], temp_df["prop_country_id"]))
org_dest_dict = temp_df.groupby(["new_col"]).median()["orig_destination_distance"].to_dict()

temp_nan = train_data_new[["visitor_location_country_id","prop_country_id","orig_destination_distance"]][train_data_new["orig_destination_distance"].isnull()]
temp_nan['new_col'] = list(zip(temp_nan["visitor_location_country_id"], temp_nan["prop_country_id"]))
if not temp_nan["new_col"].map(org_dest_dict).empty:
    train_data_new["orig_destination_distance"] = temp_nan["new_col"].map(org_dest_dict)
    train_data_new["orig_destination_distance"][train_data_new["orig_destination_distance"].isnull()] = train_data_new["orig_destination_distance"].median()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


### Filling in the missing values for "srch_query_affinity_score"

In [337]:
dict_scores = train_data_new[["prop_id","srch_query_affinity_score"]][train_data_new["srch_query_affinity_score"].notnull()].groupby(["prop_id"]).median().to_dict()["srch_query_affinity_score"]

# search for past scores based on prop_id
train_data_new["srch_query_affinity_score"][train_data_new["srch_query_affinity_score"].isnull()] = train_data_new["prop_id"].map(dict_scores)

# for all remaining values, use the minimum affinity score
train_data_new["srch_query_affinity_score"][train_data_new["srch_query_affinity_score"].isnull()] = train_data_new["srch_query_affinity_score"][train_data_new["srch_query_affinity_score"].notnull()].min()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


### Filling in competition rate

In [338]:
competitors = [5]
for i in competitors:
    feature = f"comp{i}_rate"
    dict_rates = train_data_new[["prop_id",feature]][train_data_new[feature].notnull()].groupby(["prop_id"]).max().to_dict()[feature]
    train_data_new[feature][train_data_new[feature].isnull()] = train_data_new["prop_id"].map(dict_rates)
    train_data_new[feature][train_data_new[feature].isnull()] = train_data_new[feature][train_data_new[feature].notnull()].mode().iloc[0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


### Filling in competition availability

In [339]:
competitors = [5]
for i in competitors:
    feature = f"comp{i}_inv"
    dict_avail = train_data_new[["prop_id",feature]][train_data_new[feature].notnull()].groupby(["prop_id"]).max().to_dict()[feature]
    train_data_new[feature][train_data_new[feature].isnull()] = train_data_new["prop_id"].map(dict_avail)
    train_data_new[feature][train_data_new[feature].isnull()] = train_data_new[feature][train_data_new[feature].notnull()].mode().iloc[0]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


### Get overview of missing data per feature after data has been inserted

In [340]:
post_table = []
for i in train_data_new.columns:
    total_null = train_data_new[i].isnull().sum()
    post_table.append([i, total_null, 100*total_null/len(train_data_new[i])])

In [341]:
post_values_overview = pd.DataFrame(post_table, columns = ["Feature Name", "Total Null", "% Null"])
post_values_overview.sort_values("% Null",ascending = False) # arrange features based on % Null 

Unnamed: 0,Feature Name,Total Null,% Null
29,comp1_rate_percent_diff,4863908,98.095353
44,comp6_rate_percent_diff,4862173,98.060362
27,comp1_rate,4838417,97.58125
28,comp1_inv,4828788,97.387053
38,comp4_rate_percent_diff,4827261,97.356256
52,gross_bookings_usd,4819957,97.208949
47,comp7_rate_percent_diff,4819832,97.206428
42,comp6_rate,4718190,95.156511
4,visitor_hist_starrating,4706481,94.920364
5,visitor_hist_adr_usd,4705359,94.897735
