# Import modules

In [None]:
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import mutual_info_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

# Data define

In [None]:
# load datasets
def load_data(file_path):
    """Read the CSV at ``file_path`` and return its contents as a DataFrame.

    ``file_path`` is handed to ``pandas.read_csv`` unchanged.
    """
    frame = pd.read_csv(file_path)
    return frame

# drop the id column and keep only the rows whose target value is positive
def clean_id_and_target(data, id_col, target):
    """Drop the ``id_col`` column(s) and keep rows whose ``target`` is > 0.

    The input frame is not modified; a new frame is returned. Rows keep
    their original index labels.
    """
    without_id = data.drop(columns=id_col)
    has_positive_target = without_id[target] > 0
    return without_id[has_positive_target]

# handle missing values
def remove_null(data):
    """Return ``data`` without any row that contains a missing value."""
    complete_rows = data.dropna(axis=0, how="any")
    return complete_rows

# target distribution plot: box plot and histogram (distribution plot)
def target_plot(data, target):
    """Show a box plot and a 20-bin distribution plot of one column.

    Parameters
    ----------
    data : DataFrame holding the column to plot.
    target : label of the column to plot.

    The two plots are drawn side by side in one 8x6 figure, which is then
    displayed with ``plt.show()``. Nothing is returned.
    """
    # Look the column up by the label that was passed in. The previous
    # ``data.target`` always read an attribute literally named "target"
    # and ignored the ``target`` argument.
    values = data[target]

    plt.figure(figsize=(8, 6))
    plt.subplot(121)
    sns.boxplot(values)
    plt.subplot(122)
    # NOTE(review): seaborn documents distplot as deprecated in favour of
    # histplot/displot; kept here so older seaborn versions still work.
    sns.distplot(values, bins=20)
    plt.show()
    
# merge train df and test df, key = same column in 2 different tables
def merge_df(df1, df2, key):
    """Inner-join ``df1`` with ``df2`` on the shared column(s) ``key``."""
    joined = df1.merge(df2, how='inner', on=key)
    return joined

# shuffle dataset
def shuffle_data(data):
    """Return the rows of ``data`` in shuffled order with a fresh index.

    ``reset_index()`` is called without ``drop``, so the pre-shuffle index
    is kept in the result as an extra leading column.
    """
    shuffled = shuffle(data)
    return shuffled.reset_index()

# combine defs for a cleaned dataset, key = same col in 2 different tables
def create_clean_df(train_object, target_object, target, key):
    """Build the cleaned training frame from the feature and target tables.

    The two tables are merged on ``key`` with ``merge_df``, passed through
    ``clean_id_and_target`` (with ``key`` as the id column) and finally
    shuffled with ``shuffle_data``.
    """
    merged = merge_df(train_object, target_object, key)
    cleaned = clean_id_and_target(merged, key, target)
    return shuffle_data(cleaned)



# Data discover

In [None]:
# single categorical column transform into one hot encoding table
def one_hot_encoding(data, col):
    """Return the indicator (one-hot) table for column ``col`` of ``data``.

    Only the encoded columns are returned; ``data`` itself is unchanged.
    """
    column = data[col]
    return pd.get_dummies(column)

# single categorical column transform into label encoder
def Label_encoder(data, col):
    """Replace column ``col`` of ``data`` with its label-encoded values.

    The frame is modified in place and nothing is returned.
    """
    encoder = LabelEncoder().fit(data[col])
    data[col] = encoder.transform(data[col])
    
# groupby features and compute for mean value
def feature_groups_mean(data, cate_cols, target):
    """Return the mean of ``target`` for each group of ``cate_cols``.

    The result has the grouping columns as ordinary columns plus one
    ``groups_mean`` column holding the per-group mean.
    """
    per_group_mean = data.groupby(by=cate_cols)[target].mean()
    summary = pd.DataFrame({'groups_mean': per_group_mean})
    return summary.reset_index()

# merge new feature columns with same column in tables
def merge_df(df1, df2, key, how='left', fillna=False):
    """Merge ``df2`` into ``df1`` on the shared column(s) ``key``.

    Parameters
    ----------
    df1, df2 : the DataFrames to merge.
    key : column label, or list of labels, present in both frames.
    how : join type passed to ``pandas.merge``; defaults to ``'left'``.
    fillna : when true, every missing value in the merged result is
        replaced with 0.

    Returns
    -------
    The merged DataFrame. Neither input frame is modified.
    """
    # Pass the caller's join type through; it used to be hard-coded to
    # 'left', which silently ignored the ``how`` argument.
    merged = pd.merge(df1, df2, on=key, how=how)
    if fillna:
        merged = merged.fillna(0)
    return merged

# merge one hot encoding table or new features with main df
def concat_df(df1, df2):
    """Place ``df1`` and ``df2`` side by side, aligning rows on the index."""
    frames = [df1, df2]
    return pd.concat(frames, axis=1)

# explore MI score for target and features to examine any high score features
# X = entire origin df, y = target column
# need to exclude id cols
def make_mi_scores(X, y):
    """Return mutual-information scores between each column of ``X`` and ``y``.

    Works on a copy of ``X``. Object and category columns are replaced by
    their integer factor codes, and every integer-typed column is flagged
    as discrete for ``mutual_info_regression``. The scores come back as a
    Series named "MI Scores", indexed by column and sorted high to low.
    """
    encoded = X.copy()
    categorical_cols = encoded.select_dtypes(["object", "category"]).columns
    for name in categorical_cols:
        codes, _uniques = encoded[name].factorize()
        encoded[name] = codes

    # After factorizing, an integer dtype is what marks a feature as discrete.
    is_discrete = [pd.api.types.is_integer_dtype(dtype) for dtype in encoded.dtypes]
    raw_scores = mutual_info_regression(
        encoded, y, discrete_features=is_discrete, random_state=0
    )
    scores = pd.Series(raw_scores, name="MI Scores", index=encoded.columns)
    return scores.sort_values(ascending=False)

# Data develop

In [None]:
class ModelEvaluation:
    """Tune, cross-validate and inspect regression models for one feature set.

    The constructor reads ``new_df`` and ``target_col`` from the ``feature``
    object it is given. The tuning and training methods work on the ``X`` and
    ``y`` passed to them.
    """

    def __init__(self, feature):
        self.feature = feature
        self.train_df = feature.new_df
        self.target_col = feature.target_col
        self.input_df = self.create_input_df()
        # Score dict most recently handed to train_model; lets the
        # feature-importance methods find the best model without a global.
        self._mean_mse = None

    def create_input_df(self):
        """Return every column of ``train_df`` from the third one onwards."""
        # NOTE(review): presumably the first two columns are not model
        # inputs -- confirm against the object that builds ``new_df``.
        return self.train_df.iloc[:, 2:]

    @staticmethod
    def _is_valid_max_features(candidate):
        """Tell whether ``candidate`` is a usable ``max_features`` value."""
        if isinstance(candidate, (int, np.integer)):
            return candidate >= 1
        return 0.0 < candidate <= 1.0

    @staticmethod
    def _valid_neighbours(value, step, is_valid):
        """Return ``value - step``, ``value``, ``value + step`` that pass ``is_valid``.

        ``None`` (e.g. an unlimited ``max_depth``) has no numeric neighbours
        and is returned on its own. If no candidate passes, ``value`` itself
        is kept so the grid entry is never empty.
        """
        if value is None:
            return [None]
        candidates = (value - step, value, value + step)
        return [c for c in candidates if is_valid(c)] or [value]

    @staticmethod
    def _random_grid(n_rows):
        """Build the random-search candidates, leaving out invalid values.

        The ranges are the ones the search has always used, but values the
        estimators reject (0 trees, depth 0, a 0.0 or >1.0 feature fraction,
        ``min_samples_split`` below 2) are dropped so no fit is wasted.
        """
        n_estimators = [
            n for n in (int(x) for x in np.linspace(0, 100, num=20)) if n >= 1
        ]
        # NOTE(review): the fraction is scaled by the number of rows, which
        # looks unintended for max_features -- confirm the intended range.
        max_features = [
            float(f)
            for f in (x / n_rows for x in np.linspace(0, 80, num=20))
            if 0.0 < f <= 1.0
        ] or [1.0]
        max_depth = [
            d for d in (int(x) for x in np.linspace(0, 80, num=20)) if d >= 1
        ]
        max_depth.append(None)
        min_samples_split = [
            s for s in (int(x) for x in np.linspace(0, 100, num=20)) if s >= 2
        ]
        return {
            'n_estimators': n_estimators,
            'max_features': max_features,
            'max_depth': max_depth,
            'min_samples_split': min_samples_split,
        }

    def model_tuning_RDsearch(self, model, X, y):
        """Run a randomized search over ``model`` and return the best params."""
        random_grid = self._random_grid(X.shape[0])
        RD = RandomizedSearchCV(
            model,
            param_distributions=random_grid,
            pre_dispatch=4,
            cv=2,
            verbose=1,
            random_state=1,
            n_jobs=4,
        )
        RD.fit(X, y)
        return RD.best_params_

    def model_tuning_GdsearchCV(self, model, X, y):
        """Refine the randomized-search result with a small grid search.

        Each parameter is searched at its best random-search value and one
        step either side. Out-of-range neighbours are dropped, and a
        ``max_depth`` of ``None`` is searched as-is instead of raising on
        ``None - 10``.
        """
        best_params = self.model_tuning_RDsearch(model, X, y)
        n = best_params['n_estimators']
        max_f = best_params['max_features']
        max_d = best_params['max_depth']
        min_s = best_params['min_samples_split']
        grid = {
            'n_estimators': self._valid_neighbours(n, 5, lambda c: c >= 1),
            # NOTE(review): a step of 1 leaves only the centre value for a
            # fractional max_features -- confirm the intended step size.
            'max_features': self._valid_neighbours(
                max_f, 1, self._is_valid_max_features
            ),
            'max_depth': self._valid_neighbours(max_d, 10, lambda c: c >= 1),
            'min_samples_split': self._valid_neighbours(
                min_s, 2, lambda c: c >= 2
            ),
        }
        GD = GridSearchCV(model, param_grid=grid, cv=3, verbose=2, n_jobs=4)
        GD.fit(X, y)
        return GD.best_params_

    def train_model(self, model, X, y, n_jobs, mean_mse, cv_std):
        """Cross-validate ``model`` with 5 folds and record its scores.

        The mean MSE is stored in ``mean_mse[model]`` and the standard
        deviation of the fold scores in ``cv_std[model]``; a summary is
        printed. ``mean_mse`` is also remembered on the instance for the
        feature-importance methods.
        """
        mse = cross_val_score(
            model, X, y, cv=5, n_jobs=n_jobs, scoring='neg_mean_squared_error'
        )
        # The scorer returns negated MSE, so flip the sign back.
        mean_mse[model] = -1 * np.mean(mse)
        cv_std[model] = np.std(mse)
        self._mean_mse = mean_mse
        return self._print_summary(model, mean_mse, cv_std)

    def _print_summary(self, model, mean_mse, cv_std):
        """Print the model with its mean MSE and cross-validation spread."""
        print('\nModel:\n', model)
        print('Average MSE:\n', mean_mse[model])
        print('Standard deviation during CV:\n', cv_std[model])

    def _best_model(self, mean_mse):
        """Return the key of ``mean_mse`` with the smallest value."""
        return min(mean_mse, key=mean_mse.get)

    def create_feature_importance(self, X, y, mean_mse=None):
        """Fit the best-scoring model and tabulate its feature importances.

        Parameters
        ----------
        X, y : data the best model is fitted on.
        mean_mse : optional dict mapping models to their mean MSE. When
            omitted, the dict last passed to ``train_model`` is used.

        Returns
        -------
        DataFrame indexed by feature name with one ``importances`` column,
        sorted from most to least important.

        Raises
        ------
        ValueError
            If no scores are given and ``train_model`` has not been run.
        """
        # The scores used to be read from an undefined name; take them from
        # the argument or from what train_model recorded instead.
        scores = mean_mse if mean_mse is not None else getattr(self, '_mean_mse', None)
        if not scores:
            raise ValueError(
                'No model scores available: pass mean_mse or call train_model first.'
            )
        model = self._best_model(scores)
        model.fit(X, y)
        importances = model.feature_importances_
        table = pd.DataFrame({'features': X.columns, 'importances': importances})
        table = table.sort_values(by='importances', ascending=False)
        return table.set_index('features', drop=True)

    def plot_feature_importance(self, X=None, y=None, mean_mse=None):
        """Draw a bar chart of the best model's feature importances.

        ``X`` defaults to ``self.input_df`` and ``y`` to the ``target_col``
        column of ``self.train_df``; ``mean_mse`` is forwarded to
        ``create_feature_importance``.
        """
        # NOTE(review): X and y used to be read from undefined names. The
        # instance defaults assume input_df holds only model inputs and
        # target_col names the target column in train_df -- confirm.
        if X is None:
            X = self.input_df
        if y is None:
            y = self.train_df[self.target_col]
        feature_importances_df = self.create_feature_importance(X, y, mean_mse)
        feature_importances_df.plot.bar(figsize=(10, 8))
        plt.xticks(rotation=45)
        plt.show()