In [54]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns


In [55]:
# Data directory, relative to the notebook's working directory; the parquet
# inputs are read from here and the processed CSVs are written back to it.
PATH = Path('../data')

In [56]:
# Read the training and submission frames from the data directory.
train_data = pd.read_parquet(PATH.joinpath('train_data.parquet'))
submission_data = pd.read_parquet(PATH.joinpath('submission_data.parquet'))

In [57]:
# TODO: new file to encode the categorical/ordinal features.
# TODO: remove the columns that are not useful (highly correlated).
# TODO: handle missing values / outliers.
# TODO: feature engineering.


#check the feature importance

### Check null values



In [58]:
# Fraction of missing values per column, relative to the total number of rows.
# (A bare expression that is not the last line of a cell is never displayed,
# so the raw null counts are not computed separately here.)
train_data.isna().mean()

brand            0.000000
phase            0.000000
country          0.000000
dayweek          0.000000
month            0.000000
wd_perc          0.000000
ther_area        0.126736
hospital_rate    0.179554
n_nwd_bef        0.000000
n_nwd_aft        0.000000
n_weekday_0      0.000000
n_weekday_1      0.000000
n_weekday_2      0.000000
n_weekday_3      0.000000
n_weekday_4      0.000000
date             0.000000
wd               0.000000
wd_left          0.000000
monthly          0.000000
main_channel     0.179554
dtype: float64

In [59]:
# Fraction of missing values per column in the submission set.
submission_data.isna().mean()

country          0.000000
brand            0.000000
dayweek          0.000000
month            0.000000
wd_perc          0.000000
ther_area        0.109807
hospital_rate    0.109807
n_nwd_bef        0.000000
n_nwd_aft        0.000000
n_weekday_0      0.000000
n_weekday_1      0.000000
n_weekday_2      0.000000
n_weekday_3      0.000000
n_weekday_4      0.000000
date             0.000000
wd               0.000000
wd_left          0.000000
main_channel     0.109807
dtype: float64

In [60]:
# Keep only the training rows whose hospital_rate is known.
# NOTE(review): submission_data also has missing hospital_rate values, which
# process_data imputes later; rows of that kind are therefore never seen in
# the training frame -- confirm this is intended.
train_data = train_data.loc[train_data['hospital_rate'].notna()]
train_data.isna().mean()


brand            0.0
phase            0.0
country          0.0
dayweek          0.0
month            0.0
wd_perc          0.0
ther_area        0.0
hospital_rate    0.0
n_nwd_bef        0.0
n_nwd_aft        0.0
n_weekday_0      0.0
n_weekday_1      0.0
n_weekday_2      0.0
n_weekday_3      0.0
n_weekday_4      0.0
date             0.0
wd               0.0
wd_left          0.0
monthly          0.0
main_channel     0.0
dtype: float64

In [61]:
# from sklearn.feature_selection import SelectKBest
# from sklearn.feature_selection import f_classif

# def calculate_anova(df, features, target, k="all"):
#   # apply selectKBest to select top features
#   best_features = SelectKBest(score_func=f_classif, k=k)
#   df_cut = df[features]
#   fit = best_features.fit(df_cut, df[target])
#   dfscores = pd.DataFrame(fit.scores_)
#   dfcolumns = pd.DataFrame(df_cut.columns)

#   feature_score = pd.concat([dfcolumns, dfscores], axis=1)
#   feature_score.columns = ["Features", "Score"]
#   ret = feature_score.sort_values(by="Score", ascending=False)
#   return ret.head(k) if k != "all" else ret

# anova_features = ['hospital_rate', 'monthly', 'n_weekday_0','n_weekday_1', 'n_weekday_2', 'n_weekday_3', 'n_weekday_4', 'n_nwd_bef', 'n_nwd_aft'];

# calculate_anova(train_data, features=anova_features, target='phase')

In [62]:
def process_data(df, hospital_rate_fill=None):
    """Return a processed copy of ``df``; the input frame is left untouched.

    Steps applied, in order:
      * ``brand`` and ``country`` are cast to ``category``.
      * ``dayweek`` and ``month`` are replaced by sine/cosine encodings
        (``<name>_sin`` / ``<name>_cos``) with periods 7 and 12.
      * ``wd``, ``n_nwd_bef`` and ``n_nwd_aft`` are cast to ``int64``.
      * Missing ``ther_area`` / ``main_channel`` values become ``'unknown'``.
      * Missing ``hospital_rate`` values are filled with ``hospital_rate_fill``.
      * ``wd_left`` and ``wd_perc`` are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the columns listed above.
    hospital_rate_fill : float, optional
        Value used for missing ``hospital_rate`` entries. When ``None``
        (default) the median of ``df['hospital_rate']`` is used, which is the
        original behaviour. Pass the training median here to impute another
        frame consistently with the training data.

    Returns
    -------
    pandas.DataFrame
        New frame: the remaining original columns in their original order,
        followed by ``dayweek_sin``, ``dayweek_cos``, ``month_sin``,
        ``month_cos``.
    """
    # Work on a copy: the column assignments below would otherwise write into
    # the caller's DataFrame (adding columns / changing dtypes behind its back)
    # and raise SettingWithCopyWarning when a filtered frame is passed in.
    df = df.copy()

    df = df.astype({'brand': 'category', 'country': 'category'})

    # Cyclical encoding so that the last and first value of a cycle end up
    # close to each other.
    # NOTE(review): the n_weekday_0..4 columns suggest dayweek may only take
    # five values -- confirm that 7 is the intended period.
    for column, period in (('dayweek', 7), ('month', 12)):
        angle = df[column] * 2 * np.pi / period
        df[f'{column}_sin'] = np.sin(angle)
        df[f'{column}_cos'] = np.cos(angle)
        df = df.drop(columns=[column])

    df = df.astype({'wd': 'int64', 'n_nwd_bef': 'int64', 'n_nwd_aft': 'int64'})

    # np.where (rather than fillna) always yields a plain object column, so
    # the 'unknown' label can be introduced regardless of the source dtype.
    for column in ('ther_area', 'main_channel'):
        df[column] = np.where(df[column].isnull(), 'unknown', df[column])

    if hospital_rate_fill is None:
        hospital_rate_fill = df['hospital_rate'].median()
    df['hospital_rate'] = np.where(
        df['hospital_rate'].isnull(), hospital_rate_fill, df['hospital_rate']
    )

    # drop features
    columns_to_drop = ['wd_left', 'wd_perc']
    return df.drop(columns=columns_to_drop)

# Run the same preprocessing on the training frame, then the submission frame.
new_train_data, new_submission_data = (
    process_data(frame) for frame in (train_data, submission_data)
)

In [63]:
# Check dtypes and remaining null counts of both processed frames.
# Only the last expression of a cell is rendered automatically, so the
# training null counts are passed to display() explicitly.
new_train_data.info()
display(new_train_data.isna().sum())

new_submission_data.info()
new_submission_data.isna().sum()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2307496 entries, 5090 to 3524651
Data columns (total 20 columns):
 #   Column         Dtype         
---  ------         -----         
 0   brand          category      
 1   phase          float64       
 2   country        category      
 3   ther_area      object        
 4   hospital_rate  float64       
 5   n_nwd_bef      int64         
 6   n_nwd_aft      int64         
 7   n_weekday_0    int64         
 8   n_weekday_1    int64         
 9   n_weekday_2    int64         
 10  n_weekday_3    int64         
 11  n_weekday_4    int64         
 12  date           datetime64[ns]
 13  wd             int64         
 14  monthly        float64       
 15  main_channel   object        
 16  dayweek_sin    float64       
 17  dayweek_cos    float64       
 18  month_sin      float64       
 19  month_cos      float64       
dtypes: category(2), datetime64[ns](1), float64(7), int64(8), object(2)
memory usage: 341.1+ MB
<class 'pandas.cor

country          0
brand            0
ther_area        0
hospital_rate    0
n_nwd_bef        0
n_nwd_aft        0
n_weekday_0      0
n_weekday_1      0
n_weekday_2      0
n_weekday_3      0
n_weekday_4      0
date             0
wd               0
main_channel     0
dayweek_sin      0
dayweek_cos      0
month_sin        0
month_cos        0
dtype: int64

In [64]:
# Write both processed frames to the data directory as CSV, without the index.
for frame, filename in (
    (new_train_data, 'new_train_data.csv'),
    (new_submission_data, 'new_submission_data.csv'),
):
    frame.to_csv(PATH / filename, index=False)