# Salary Predictions Based on Job Descriptions

# Part 1 - DEFINE

### ---- 1 Define the problem ----

    Given the sample training data, train a model that predicts the salary of new job postings based on their job descriptions.

In [None]:
#import your libraries
import numpy as np
import pandas as pd
from sklearn import metrics
import string

import matplotlib.pylab as pylab
params = {'legend.fontsize': 'x-large',
          'figure.figsize': (15, 5),
         'axes.labelsize': 'large',
         'axes.titlesize':'large',
         'axes.labelcolor':'#5F4C0B',
         'axes.titlecolor':'#B40404',
         'xtick.labelsize':'large',
         'ytick.labelsize':'large',
         'xtick.color':'red', 
         'ytick.color':'green'}
pylab.rcParams.update(params)

from scipy import stats
import os
import sys
import pathlib

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor,GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV,cross_val_score,KFold
from sklearn.preprocessing import StandardScaler

from matplotlib import pyplot as plt
import seaborn as sns


#your info here
__author__ = "Vijayan Nallasami"
__email__ = "VNallasami@gmail.com"

In [None]:
# Notebook display settings: show up to 500 columns/rows of a DataFrame and
# let NumPy print up to 100 characters per line before wrapping arrays.
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500
np.set_printoptions(linewidth=100)

## Part 2 - DISCOVER

### ---- 2 Load the data ----

In [None]:
class fileload_:
    """Load the train/test CSV files and assemble the training frame.

    Parameters
    ----------
    path : str
        Directory that contains the CSV files.
    files : dict
        Maps the roles ``'train_feature'``, ``'train_target'`` and ``'test'``
        to file names inside ``path``. Other keys are ignored.
    target : str
        Name of the target column.
    Id : str
        Column used to join features and targets; it is dropped from
        ``train`` after the merge.

    After construction the instance exposes ``train_features``,
    ``train_target``, ``test_features`` (for the roles that were supplied),
    ``train``, ``numerical_features`` and ``categorical_features``.
    """

    # role name in ``files`` -> attribute that receives the loaded frame
    _ATTRIBUTE_FOR_ROLE = {
        'train_feature': 'train_features',
        'train_target': 'train_target',
        'test': 'test_features',
    }

    def __init__(self, path, files, target, Id):
        self.file_path = path
        self.file_dict = files
        self.target_column = target
        self.merge_Id = Id
        self.load_data()
        self.merge_df()
        self.find_feature_type()

    def load_data(self):
        """Read every recognised entry of ``file_dict`` into its attribute."""
        for name, file in self.file_dict.items():
            attribute = self._ATTRIBUTE_FOR_ROLE.get(name)
            if attribute is None:
                continue
            # Build the location from the constructor argument; the previous
            # code read a module-level ``path`` and broke without it.
            location = os.path.join(self.file_path, file)
            setattr(self, attribute, pd.read_csv(location))

    def merge_df(self):
        """Join features and targets on the id column, then drop that column."""
        merged = pd.merge(self.train_features, self.train_target, on=self.merge_Id)
        self.train = merged.drop(labels=self.merge_Id, axis=1)

    def find_feature_type(self):
        """Split the columns of ``train`` into object and non-object dtypes.

        The target column is part of ``train`` and therefore appears in one
        of the two lists as well.
        """
        self.numerical_features = self.train.select_dtypes(exclude='object').columns
        self.categorical_features = self.train.select_dtypes('object').columns

### ---- 3 Clean the data ----

In [None]:
class datacleaning_:
    """Clean the training frame held by a loader object, in place.

    ``dataframe`` must expose ``train`` (a DataFrame) and ``target_column``.
    Exact duplicate rows are removed first, then rows whose target is below 1.
    """

    def __init__(self, dataframe):
        self.train_df = dataframe.train
        self.target_column = dataframe.target_column
        for cleaning_step in (self.remove_dupes, self.remove_invalid_target):
            cleaning_step()
        dataframe.train = self.train_df

    def remove_dupes(self):
        """Report and drop rows that repeat an earlier row."""
        repeated = self.train_df.duplicated(keep='first')
        print("Duplicate rows : ", len(self.train_df[repeated]))
        self.train_df.drop_duplicates(keep='first', inplace=True)

    def remove_invalid_target(self):
        """Drop rows whose target value is smaller than 1 and report how many."""
        below_one = self.train_df[self.target_column] < 1
        invalid_index = self.train_df.index[below_one]
        self.train_df.drop(invalid_index, inplace=True)
        print("No.of rows with invalid target", len(invalid_index))
        

In [None]:
def cat_ordered_mean_encoding(X,y,target):
    """Add a ``<column>_enc`` rank column for every object-dtype column of X.

    Categories are ranked by their mean target value: the category with the
    lowest mean gets 1, the next one 2, and so on. The inputs are not
    modified; the returned frame holds the original columns plus the new
    ``_enc`` columns, without the target column.
    """
    combined = pd.concat([X.copy(), y.copy()], axis=1)

    for column in X.select_dtypes('O').columns:
        by_mean = combined.groupby([column])[target].mean().sort_values(ascending=True).index
        rank_of = dict(zip(by_mean, range(1, len(by_mean) + 1)))
        combined[column + '_enc'] = combined[column].map(rank_of)

    return combined.drop(labels=target, axis=1)

In [None]:
def cat_combined_mean_encoding(X,y,target,var_dict):
    """Create rank-encoded interaction features from pairs of columns.

    ``var_dict`` maps each new column name to a sequence whose first two
    items name the columns to combine. The two columns are joined with a
    single space, and every combined label is replaced by its rank when the
    labels are ordered by mean target value (lowest mean = 1). The inputs
    are not modified; the result excludes the target column.
    """
    combined = pd.concat([X.copy(), y.copy()], axis=1)

    for new_column, sources in var_dict.items():
        left, right = sources[0], sources[1]
        combined[new_column] = combined[left] + ' ' + combined[right]

        by_mean = combined.groupby(new_column)[target].mean().sort_values(ascending=True).index
        rank_of = dict(zip(by_mean, range(1, len(by_mean) + 1)))

        combined[new_column] = combined[new_column].map(rank_of)

    return combined.drop(labels=target, axis=1)

In [None]:
def remove_duplicate_feature(df):
    """Drop columns whose values equal those of an earlier column.

    The frame is modified in place. Returns the same frame together with the
    set of column names that were removed.
    """
    names = df.columns
    dup_list = {
        later
        for position, earlier in enumerate(names)
        for later in names[position + 1:]
        if df[earlier].equals(df[later])
    }

    df.drop(labels=dup_list, axis=1, inplace=True)
    return df, dup_list

def correlation(dataset, threshold):
    """Return the names of columns that correlate strongly with an earlier one.

    A column is reported when the absolute correlation with any column that
    precedes it in ``dataset.corr()`` is greater than ``threshold``.
    """
    corr_matrix = dataset.corr()
    col_corr = set()
    for position, colname in enumerate(corr_matrix.columns):
        # Entries left of the diagonal: the pairs formed with earlier columns.
        with_earlier = corr_matrix.iloc[position, :position]
        if (with_earlier.abs() > threshold).any():
            col_corr.add(colname)
    return col_corr

### ---- 4 Explore the data (EDA) ----

In [None]:
#summarize each feature variable
#summarize the target variable
#look for correlation between each feature and the target
#look for correlation between features

In [None]:
class data_analysis_:
    """Exploratory summaries and plots for the training data.

    Parameters
    ----------
    data : object, optional
        Loader object exposing ``train``, ``target_column``,
        ``numerical_features`` and ``categorical_features``. When omitted,
        the module-level ``dataframe`` object is used, which keeps the
        original no-argument call working.
    """

    def __init__(self, data=None):
        if data is None:
            # Backward-compatible default: the notebook-level loader object.
            data = dataframe
        self.dataframe = data.train
        self.target = data.target_column
        self.numerical_features = data.numerical_features
        self.categorical_features = data.categorical_features

    def explore_data(self, pred=None):
        """Print a per-column summary and the unique values of each column.

        The summary is stored in ``self.eda``. Skewness, kurtosis and the
        optional correlation with ``pred`` are computed on numeric columns
        only, so they are NaN for every other column.
        """
        frame = self.dataframe
        numeric = frame.select_dtypes(include='number')
        obs = frame.shape[0]

        nulls = frame.isnull().sum()
        parts = [
            frame.dtypes,
            frame.count(),
            frame.nunique(dropna=False),
            nulls,
            (nulls / obs) * 100,
            numeric.skew(),
            numeric.kurt(),
        ]
        cols = ['Types', 'Count', 'DistinctValues', 'Nulls', 'MissingRatio', 'Skewness', 'Kurtosis']

        if pred is not None:
            parts.append(numeric.corr()[pred])
            cols.append('corr ' + pred)

        self.eda = pd.concat(parts, axis=1, sort=False)
        self.eda.columns = cols

        print(self.eda)

        for name in frame.columns:
            values = frame[name].unique()
            print('-'*120)
            # Short value lists are printed in full, long ones only counted.
            if len(values) < 20:
                print(name, ' : ', values)
            else:
                print(name, ' : {} unique values'.format(len(values)))
        print('-'*120)

    def cardinality_plot(self,h,w):
        """Bar chart of the number of distinct values per categorical column."""
        sns.set(rc={'figure.figsize':(w,h)})
        df = self.dataframe[self.categorical_features].nunique().to_frame().reset_index()
        df.columns = ['column','count']
        b = sns.barplot(x="column", y="count", data=df)
        b.axes.set_title("Categorical columns cardinality",fontsize=20)
        b.set_xlabel("Column Name",fontsize=15)
        b.set_ylabel("Count of Categories",fontsize=15)
        b.tick_params(labelsize=15)
        plt.show()

    def displot(self,col):
        """Plot the distribution of ``col`` with a vertical line at its mean."""
        # Read from the instance's frame rather than the global loader.
        mean = self.dataframe[col].mean()
        b = sns.distplot(self.dataframe[[col]],color='red')

        title = col + " Distribution"

        b.axes.set_title(title.title(),fontsize=20)
        plt.axvline(mean,0,1,color='blue')

    def _subplot_grid(self, n_plots, fig_h, fig_w):
        """Create one axes per plot and delete the unused cells of the grid."""
        nrow, ncol = self.choose_subplot_dimensions(n_plots)
        # NOTE: the pair is forwarded unchanged as ``figsize=(fig_h, fig_w)``,
        # exactly as the plotting methods always did.
        figsize = None if fig_h is None or fig_w is None else (fig_h, fig_w)
        # squeeze=False keeps ``axes`` two-dimensional for 1-row/1-column grids.
        fig, axes = plt.subplots(nrow, ncol, figsize=figsize, squeeze=False)
        flat = list(axes.ravel())  # row-major, same order as generate_axis_array
        for spare in flat[n_plots:]:
            fig.delaxes(spare)
        return fig, flat[:n_plots]

    def linegraph(self,var_list,fig_h=None,fig_w=None):
        """Plot, for each variable, the mean target of every category."""
        if not var_list:
            return

        fig, axes = self._subplot_grid(len(var_list), fig_h, fig_w)

        for ax, col in zip(axes, var_list):
            df = self.dataframe.groupby(col)[self.target].mean().sort_values()
            df = df.to_frame().reset_index()

            ax.plot(df[self.target],df[col])
            ax.set_xlabel(self.target)
            ax.set_title(string.capwords(col))

        plt.tight_layout()

    def barplot(self,var_list,fig_h=None,fig_w=None):
        """Plot, for each variable, the number of rows per category."""
        if not var_list:
            return

        fig, axes = self._subplot_grid(len(var_list), fig_h, fig_w)

        for ax, col in zip(axes, var_list):
            df = self.dataframe.groupby(col)[self.target].count().to_frame().reset_index()
            df.columns = [col,'count']

            sns.barplot(x="count", y=col, ci=False,  data=df,ax=ax)

        plt.tight_layout()

    def barplothue(self,var_x,var_hue,fig_h=12,fig_w=5):
        """Grouped bar chart of the target by ``var_x``, split by ``var_hue``."""
        sns.set(rc={'figure.figsize':(fig_h,fig_w)})
        g = sns.barplot(x=var_x, y=self.target, data=self.dataframe, ci=False, hue = var_hue, orient = 'v')
        g.legend(loc='center left', bbox_to_anchor=(1, 0.5), ncol=1)

    @staticmethod
    def choose_subplot_dimensions(k):
        """Return ``(rows, columns)`` of a grid large enough for ``k`` plots."""
        import math  # local import: the notebook's import cell lacks ``math``

        if k < 4:
            return k, 1
        elif k < 11:
            return math.ceil(k/2), 2
        else:
            return math.ceil(k/3), 3

    @staticmethod
    def generate_axis_array(nrow, ncol):
        """Return every ``[row, column]`` pair of the grid in row-major order."""
        return [[r, c] for r in range(nrow) for c in range(ncol)]

    def correlation(self, threshold):
        """Return numeric columns whose absolute correlation with an earlier
        numeric column is greater than ``threshold``."""
        col_corr = set()  # Set of all the names of correlated columns
        corr_matrix = self.dataframe.select_dtypes(include='number').corr()
        for i, colname in enumerate(corr_matrix.columns):
            for j in range(i):
                if abs(corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value
                    col_corr.add(colname)
                    break
        return col_corr

    def diagnostic_plots(self, variable,col=None):
        """Show histogram, Q-Q plot and boxplot of ``variable`` side by side.

        ``col`` is the colour handed to the histogram and the boxplot.
        """
        # define figure size
        plt.figure(figsize=(16, 4))

        # histogram
        plt.subplot(1, 3, 1)
        sns.distplot(self.dataframe[variable], bins=30,color=col,kde_kws={'bw':0.1})
        plt.title('Histogram')

        # Q-Q plot
        plt.subplot(1, 3, 2)
        stats.probplot(self.dataframe[variable].astype(float), dist="norm", plot=plt)
        plt.ylabel('Variable quantiles')

        # boxplot
        plt.subplot(1, 3, 3)
        sns.boxplot(y=self.dataframe[variable].astype(float),color=col)
        plt.title('Boxplot')

        plt.show()




In [None]:
# Location of the CSV files and the role each file plays for fileload_.
path = 'data/'
filenames = {'train_feature':'train_features.csv','train_target':'train_salaries.csv','test':'test_features.csv'}
target = 'salary'  # column to predict
Id = 'jobId'       # key used to join features and salaries

# Load and merge the training data, then clean dataframe.train in place.
dataframe = fileload_(path,filenames,target,Id)
data_clean = datacleaning_(dataframe)
dataframe.train.shape

In [None]:
dataframe.train.head()

In [None]:
da = data_analysis_()

In [None]:
da.explore_data('salary')

In [None]:
da.cardinality_plot(5,8)

In [None]:
da.displot('salary')

In [None]:
da.diagnostic_plots('salary')

In [None]:
da.linegraph(['jobType', 'degree', 'major', 'industry','yearsExperience','milesFromMetropolis'],15,15)

In [None]:
da.barplot(['jobType', 'degree', 'major', 'industry','yearsExperience','milesFromMetropolis'],15,10)

In [None]:
da.barplothue('degree','major')

In [None]:
da.barplothue('degree','jobType')

In [None]:
da.barplothue('jobType','industry')

In [None]:
da.barplothue('degree','industry')

In [None]:
# NOTE(review): this cell reads X_train and y_train, which are first assigned
# in the next cell, and no visible cell creates a "degree_major" column (the
# combined feature built further down is named "degree_major_enc") — confirm
# the intended execution order and column name.
ax = sns.boxplot(x="degree_major", y="salary", data=pd.concat([X_train,y_train],axis=1), color = '#EE67CF')

In [None]:
X_train = dataframe.train.drop('salary',axis=1)
y_train = dataframe.train.salary

In [None]:
X_train = cat_ordered_mean_encoding(X_train,y_train,'salary')

In [None]:
degree_major = {'degree_major_enc':['degree','major'],'company_industry':['companyId','industry'],'jobTypeIndustry':['jobType','industry']}
X_train = cat_combined_mean_encoding(X_train,y_train,'salary',degree_major)

In [None]:
X_train.shape,y_train.shape

In [None]:
# X_train = X_train.drop(['companyId','jobType','degree','major','industry'],axis=1)
X_train = X_train.drop(['degree_enc','major_enc','industry_enc','milesFromMetropolis'],axis=1)

In [None]:
X_train.head()

In [None]:
train = pd.concat([X_train,y_train],axis=1)

sns.set(rc={'figure.figsize':(16,12)})
# Pearson correlation is computed on the numeric columns only; the tick labels
# are taken from the correlation matrix so they always match its rows/columns
# (train still carries the original text columns at this point).
pc = train.select_dtypes(include='number').corr(method ='pearson')
cols = pc.columns
ax = sns.heatmap(pc, annot=True,yticklabels=cols,xticklabels=cols,annot_kws={'size': 15})

In [None]:
X_train.head()

In [None]:
# Bin degree_major_enc into 10 equal-frequency intervals.
# NOTE(review): EqualFrequencyDiscretiser is imported from
# feature_engine.discretisers in a later cell of this notebook; that cell has
# to run before this one.
disc = EqualFrequencyDiscretiser(q=10, variables = ['degree_major_enc'])
disc.fit(X_train)
X_train = disc.transform(X_train)

In [None]:
diagnostic_plots(X_train, 'degree_major_enc')

In [None]:
diagnostic_plots(X_train, 'milesFromMetropolis')

In [None]:
X_train.degree_major_enc.unique()

In [None]:
from feature_engine import variable_transformers as vt
lt = vt.LogTransformer(variables = ['jobTypeIndustry'])
lt.fit(X_train)
X_train = lt.transform(X_train)

In [None]:
diagnostic_plots(X_train, 'company_industry')

In [None]:
def diagnostic_plots(df, variable):
    """Show a histogram, a normal Q-Q plot and a boxplot of one column.

    df       -- DataFrame that holds the data
    variable -- name of the column to inspect

    NOTE: a later cell defines another ``diagnostic_plots`` with the same
    signature; whichever cell runs last is the one that gets called.
    """

    # one figure, three panels filled left to right
    plt.figure(figsize=(16, 4))

    # histogram
    plt.subplot(1, 3, 1)
    sns.distplot(df[variable], bins=30)
    plt.title('Histogram')

    # Q-Q plot against the normal distribution
    plt.subplot(1, 3, 2)
    stats.probplot(df[variable], dist="norm", plot=plt)
    plt.ylabel('Variable quantiles')

    # boxplot
    plt.subplot(1, 3, 3)
    sns.boxplot(y=df[variable])
    plt.title('Boxplot')

    plt.show()

In [None]:
def diagnostic_plots(df, variable):
    """Show a histogram and a normal Q-Q plot of one column side by side.

    df       -- DataFrame that holds the data
    variable -- name of the column to inspect

    NOTE: this reuses the name of the three-panel ``diagnostic_plots``
    defined in an earlier cell and replaces it once this cell has run.
    """

    plt.figure(figsize=(15,6))
    # left panel: histogram
    plt.subplot(1, 2, 1)
    df[variable].hist()

    # right panel: Q-Q plot against the normal distribution
    plt.subplot(1, 2, 2)
    stats.probplot(df[variable], dist="norm", plot=plt)

    plt.show()

In [None]:
da.displot('company_industry')

In [None]:
# X_train.groupby(['jobType']).jobType_enc.nunique()
# pd.crosstab(X_train.jobType,X_train.jobType_enc)
# X_train.groupby('companyId').industry.unique().to_dict()

X_train.groupby(['degree','major'])['degree_major_enc'].apply(lambda x: x.unique()).sort_values(ascending=True)

### ---- 5 Establish a baseline ----

In [None]:
#select a reasonable metric (MSE in this case)
#create an extremely simple model and measure its efficacy
#e.g. use "average salary" for each industry as your model and then measure MSE
#during 5-fold cross-validation

### ---- 6 Hypothesize solution ----

In [None]:
#brainstorm 3 models that you think may improve results over the baseline model based
#on your EDA

Brainstorm 3 models that you think may improve results over the baseline model based on your EDA and explain why they're reasonable solutions here.

Also write down any new features that you think you should try adding to the model based on your EDA, e.g. interaction variables, summary statistics for each group, etc

## Part 3 - DEVELOP

You will cycle through creating features, tuning models, and training/validating models (steps 7-9) until you've reached your efficacy goal

#### Your metric will be MSE and your goal is:
 - <360 for entry-level data science roles
 - <320 for senior data science roles

### ---- 7 Engineer features  ----

In [None]:
#make sure that data is ready for modeling
#create any new features needed to potentially enhance model

In [None]:
class feature_engineering:
    """Rare-label grouping and equal-width binning with a fit/transform API.

    ``fit`` learns the frequent labels of ``major`` and fits the discretiser
    on a copy of the training frame; the transformed training frame is kept
    in ``self.X``. ``transform`` applies the learned steps to a new frame.
    """

    def __init__(self):
        self.X = None
        self.y = None
        # variable name -> labels that are kept as they are
        self.frequent_labels_ = {}
        # fitted discretiser, None until ``discretiser`` has run
        self.discretiser_ = None

    def fit(self, X, y):
        """Learn the transformations from ``X`` and return ``self``."""
        # Work on a copy so the caller's frame is not modified.
        self.X = X.copy()
        self.y = y
        self.frequent_labels_ = {}
        self.discretiser_ = None

        # NOTE(review): a tolerance of 0.6 keeps only labels covering more
        # than 60% of the rows; the commented-out pipeline elsewhere in this
        # notebook uses tol=0.06 — confirm which value is intended.
        self.rare_encoding('major', 0.6)
        self.discretiser()
        return self

    def transform(self,X, y=None):
        """Apply the fitted steps to ``X``.

        An instance that has not been fitted returns ``X`` unchanged;
        otherwise a transformed copy is returned.
        """
        if not self.frequent_labels_ and self.discretiser_ is None:
            return X

        X = X.copy()
        for variable, frequent in self.frequent_labels_.items():
            X[variable] = np.where(X[variable].isin(frequent), X[variable], '2')
        if self.discretiser_ is not None:
            X = self.discretiser_.transform(X)
        return X

    def fit_transform(self,X, y):
        """Fit on ``X`` and return its transformed copy."""
        return self.fit(X,y).transform(X,y=None)

    def find_non_rare_labels(self, variable, tolerance):
        """Return the labels of ``variable`` whose share of rows in
        ``self.X`` is greater than ``tolerance``."""
        share = self.X.groupby(variable)[variable].count() / len(self.X)
        return list(share.loc[share > tolerance].index.values)

    def rare_encoding(self, variable, tolerance):
        """Replace every infrequent label of ``variable`` in ``self.X`` by '2'."""
        # labels frequent enough to be kept
        frequent_cat = self.find_non_rare_labels(variable, tolerance)
        self.frequent_labels_[variable] = frequent_cat

        # re-group rare labels
        self.X[variable] = np.where(self.X[variable].isin(frequent_cat), self.X[variable], '2')

    def discretiser(self):
        """Fit a 10-bin equal-width discretiser on ``self.X`` and apply it."""
        # NOTE(review): 'companyId' is also used as a text column in this
        # notebook; presumably it has to be numeric at this point — confirm.
        disc = EqualWidthDiscretiser(bins=10, variables = ['companyId', 'milesFromMetropolis','yearsExperience'])
        disc.fit(self.X)
        self.X  = disc.transform(self.X)
        self.discretiser_ = disc
    
#     def OneHotEncoding_(self):      
        
        
        

In [None]:
# remove constant and quasi constants *********************************************************
from sklearn.feature_selection import VarianceThreshold,mutual_info_regression,SelectKBest,SelectPercentile

var = VarianceThreshold(threshold=0.01)
var.fit(X_train)
kept_mask = var.get_support()
# Count the dropped columns before X_train is replaced; computed afterwards
# the difference is always zero.
print('remove constant and quasi constants',X_train.shape[1]-int(kept_mask.sum()))
# Keep the surviving column names and the row index so the features stay
# identifiable and aligned with y_train.
X_train = pd.DataFrame(var.transform(X_train),
                       columns=X_train.columns[kept_mask],
                       index=X_train.index)
columns = list(X_train.columns)

# remove duplicate features *********************************************************

# X_train, duplicate_feature = remove_duplicate_feature(X_train)
# print('remove duplicate features',len(duplicate_feature))

# remove co-related features *********************************************************

corr_features = correlation(X_train, 0.8)
X_train.drop(labels=list(corr_features),axis=1,inplace=True)
print('remove co-related features',len(corr_features),corr_features)

# # mutual information features *********************************************************
# sel = SelectKBest(mutual_info_regression,k='all')
# sel.fit(X_train,y_train)
# print('select K-30  best features',X_train.columns[sel.get_support()])
# sel_features = X_train.columns[sel.get_support()].to_list()

# scaler = StandardScaler()
# scaler.fit(X_train)
# X_train = scaler.transform(X_train)

In [None]:
X_train

In [None]:
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.feature_selection import SelectFromModel

# Lasso coefficients are only comparable on standardised features, so the
# selector is fitted on the scaled data (missing values filled with 0 first).
scaler = StandardScaler()
scaler.fit(X_train.fillna(0))
X_scaled = scaler.transform(X_train.fillna(0))

sel_ = SelectFromModel(Lasso(alpha=100))
sel_.fit(X_scaled, y_train)

# make a list with the selected features and print the outputs
selected_feat = X_train.columns[(sel_.get_support())]

print('total features: {}'.format((X_train.shape[1])))
print('selected features: {}'.format(len(selected_feat)))
print('features with coefficients shrank to zero: {}'.format(np.sum(sel_.estimator_.coef_ == 0)))

In [None]:
selected_feat

In [None]:
X_train.head()

In [None]:
# Preprocessing pipeline: one-hot encode the categorical columns, cap outliers
# with a Gaussian Winsorizer (fold=3, both tails) and standardise the result.
# NOTE(review): OneHotCategoricalEncoder, Winsorizer and Pipeline are imported
# in a later cell of this notebook; that cell has to run before this one.
pipe_prc = []

# Steps tried earlier and currently disabled:
# pipe_prc.append(('num_impute',mdi.MeanMedianImputer(imputation_method='median',variables=numerical)))
# pipe_prc.append(('cat_impute',mdi.CategoricalVariableImputer(imputation_method='missing',variables=categorical)))
# pipe_prc.append(('rare_label',RareLabelCategoricalEncoder(tol=0.06,n_categories=2,replace_with='2',return_object=True)))
# pipe_prc.append(('cat_enc',OrdinalCategoricalEncoder(encoding_method='ordered')))
# pipe_prc.append(('discretiser',EqualFrequencyDiscretiser(q=10, variables=['companyId_enc','milesFromMetropolis','degree_major_enc','company_industry','jobTypeIndustry'], return_object=True)))
# pipe_prc.append(('cat_enc',OrdinalCategoricalEncoder(encoding_method='ordered')))
pipe_prc.append(('cat_enc',OneHotCategoricalEncoder(top_categories=None,drop_last=True)))
pipe_prc.append(('outliers',Winsorizer(distribution='gaussian',tail='both',fold=3)))
pipe_prc.append(('scaler',StandardScaler()))

# Fit on the training data and replace X_train by the transformed values.
pipe = Pipeline(pipe_prc)
pipe.fit(X_train,y_train)
X_train = pipe.transform(X_train)

In [None]:
X_train = pd.DataFrame(X_train)

# X_train.drop('companyId',axis=1,inplace=True)

X_train.head()

In [None]:
# NOTE(review): `preprocessing`, `test` and `fe` are not defined in the
# visible part of this notebook — presumably they come from another cell or
# session; confirm before running.
pre_proc = preprocessing(test,'jobId','salary')
test_id = pre_proc.split_id()
test = fe.transform(test)

In [None]:
corr_features = correlation(X_train, 0.8)
# train_transformed.drop(labels=corr_features,axis=1,inplace=True)
# test_transformed.drop(labels=corr_features,axis=1,inplace=True)
print('remove co-related features',len(corr_features))

In [None]:
from sklearn.feature_selection import RFE,SelectFromModel

sel_ = SelectFromModel(XGBRegressor(n_estimators=100,max_depth=3,learning_rate=0.1,booster='gbtree'))
sel_.fit(X_train, y_train)
selected_feat = X_train.columns[(sel_.get_support())]

In [None]:
from feature_engine import missing_data_imputers as mdi
from feature_engine.categorical_encoders import OneHotCategoricalEncoder,RareLabelCategoricalEncoder,WoERatioCategoricalEncoder
from feature_engine.discretisers import EqualFrequencyDiscretiser, EqualWidthDiscretiser
from feature_engine.outlier_removers import Winsorizer
from sklearn.pipeline import Pipeline

In [None]:
X_train = X_train[selected_feat]

In [None]:
X_train.shape

In [None]:
train_transformed

### ---- 8 Create models ----

In [None]:
#create and tune the models that you brainstormed during part 2

In [None]:
def model_eval(X,y):
    """Compare the candidate regressors with 5-fold cross-validation.

    Prints the mean MSE and the standard deviation of the fold scores for
    each model and shows a boxplot of the fold scores.

    Returns
    -------
    (names, results) : the model labels and, per model, the array of fold
    scores as returned by ``cross_val_score`` (negative MSE values).
    """
    # prepare configuration for cross validation test harness
    seed = 7
    scoring = 'neg_mean_squared_error'
    # shuffle=True is required for random_state to take effect; current
    # scikit-learn raises an error when a seed is given without shuffling.
    # A fixed seed gives every model the same folds.
    kfold = KFold(n_splits=5, shuffle=True, random_state=seed)

    # prepare models
    models = [
        ('LR', LinearRegression()),
        ('RFR', RandomForestRegressor(n_estimators = 10, random_state = 0)),
        # Squared error is the default loss; the explicit 'ls' spelling was
        # removed from newer scikit-learn releases.
        ('GBR', GradientBoostingRegressor(n_estimators=40, max_depth=7)),
        ('XGB', XGBRegressor(n_estimators=100,max_depth=3,learning_rate=0.1)),
    ]

    # evaluate each model in turn
    results = []
    names = []
    for name, model in models:
        cv_results = cross_val_score(model, X, y, cv=kfold, scoring=scoring,n_jobs=-1)
        results.append(cv_results)
        names.append(name)
        # scores are negative MSE, so flip the sign for reporting
        msg = "%s: %f (%f)" % (name, -1.0*np.mean(cv_results), cv_results.std())
        print(msg)

    # boxplot algorithm comparison
    fig = plt.figure()
    fig.suptitle('Algorithm Comparison')
    ax = fig.add_subplot(111)
    plt.boxplot(results)
    ax.set_xticklabels(names)
    plt.show()
    return names,results

### ---- 9 Test models ----

In [None]:
#do 5-fold cross validation on models and measure MSE

In [None]:
import warnings
warnings.simplefilter('always')

In [None]:
# scaler = StandardScaler()
# scaler.fit(X_train)
# X_train = scaler.transform(X_train)

In [None]:
model_eval(X_train,y_train)

In [None]:
# Grid search over XGBoost hyper-parameters, scored by negative MSE with
# 5-fold cross-validation.
xgb = XGBRegressor()
param_grid = {
                'n_estimators': [100]
              , 'max_depth': [3]
              , 'learning_rate': [0.1]
              ,'booster' :['gbtree']
              # 'reg:linear' is the deprecated name of the squared-error objective
              ,'objective' :['reg:squarederror']
             }

grid = GridSearchCV(xgb, param_grid, cv=5, scoring='neg_mean_squared_error',n_jobs=-1)
grid.fit(X_train,y_train)
# y_pred = grid.predict(train_transformed)

print(grid.best_score_)
print(grid.best_params_)

In [None]:
# In-sample predictions of the best estimator found by the grid search.
y_pred = grid.predict(X_train)
# The reported value is the square root of the MSE, so label it as RMSE.
print("Root mean squared error :",np.sqrt(metrics.mean_squared_error(y_train, y_pred)))
print("r2_score :",metrics.r2_score(y_train, y_pred))

### ---- 10 Select best model  ----

In [None]:
#select the model with the lowest error as your "production" model

## Part 4 - DEPLOY

### ---- 11 Automate pipeline ----

In [None]:
#write script that trains model on entire training set, saves model to disk,
#and scores the "test" dataset

### ---- 12 Deploy solution ----

In [None]:
#save your prediction to a csv file or optionally save them as a table in a SQL database
#additionally, you want to save a visualization and summary of your prediction and feature importances
#these visualizations and summaries will be extremely useful to business stakeholders

### ---- 13 Measure efficacy ----

We'll skip this step since we don't have the outcomes for the test data

In [None]:
        if isinstance(y, pd.DataFrame):
            target = y.columns.to_list()
        else:
            target = y.to_frame().columns.to_list()
            
        self.categorical = [col for col in X.select_dtypes('O').columns]
        
        self.mapper = dict()
        
        for col in self.categorical:        
            self.col_mean = pd.concat([X, y],axis=1).groupby(col)[target[0]].mean().sort_values(ascending=True).index
            col_mapper = {k : n for n,k in enumerate(self.col_mean,0)}
            self.mapper.update({col : col_mapper})
            
                for col in self.categorical:              
            X[col] = X[col].map(self.mapper.get(col)) 

In [None]:
# Candidate models handed to the evaluation helper.
lr = LinearRegression()

rfr = RandomForestRegressor(n_estimators=60, n_jobs=-1, max_depth=15,
                            min_samples_split=80, max_features=8)

# Squared error is the default loss; the explicit 'ls' spelling was removed
# from newer scikit-learn releases.
gboost = GradientBoostingRegressor(n_estimators=40, max_depth=7)

models = [lr, rfr, gboost]

# NOTE(review): ModelEvaluation is not defined in the visible part of this
# notebook — confirm where it comes from.
model_output = ModelEvaluation(pd.concat([X_train,y_train],axis=1), models)

In [None]:
# Hand-written ordinal code for jobType: 1 = CEO ... 8 = JANITOR.
job_dict = {'JANITOR':8,'JUNIOR':7,'SENIOR':6,'MANAGER':5,'VICE_PRESIDENT':4,'CTO':3,'CFO':2,'CEO':1}

# Replace the text labels by their codes.
train['jobType'] = train['jobType'].map(job_dict)

# Display the resulting codes.
train.jobType.unique()

In [None]:
# Labels present before the mapping.
train.degree.unique()

# Hand-written ordinal code for degree: 1 = DOCTORAL ... 5 = NONE.
degree_dict = {'DOCTORAL':1,'MASTERS':2,'BACHELORS':3,'HIGH_SCHOOL':4,'NONE':5}

# Replace the text labels by their codes.
train.degree = train.degree.map(degree_dict)

# Display the resulting codes.
train.degree.unique()