In [None]:
#my info here
__author__ = "Tomo Shimo"
__email__ = "tomo.krmr@gmail.com"

In [6]:
#import the libraries we use
import pandas as pd
import sklearn as sk
import numpy as np

In [None]:
import os

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
from sklearn.pipeline import make_pipeline

In [None]:
import time

In [None]:
from sklearn.model_selection import GridSearchCV

ModuleNotFoundError: No module named 'src'

In [16]:
# some function for data loading

def load_data(path, filename):
    """Read a CSV file into a DataFrame.

    Parameters
    ----------
    path : str
        Directory that contains the file.
    filename : str
        Name of the CSV file inside ``path``.

    Returns
    -------
    pandas.DataFrame
        The result of ``pd.read_csv`` on the joined path.
    """
    return pd.read_csv(os.path.join(path, filename))

def check_df(df):
    """Print a quick data-quality report for ``df``.

    Writes the per-column count of missing values, then the number of
    duplicated rows, to standard output. Returns nothing.
    """
    # Missing values per column
    nan_counts = df.isna().sum()
    print('# of NaN', nan_counts, sep='\n')
    # Fully duplicated rows
    dup_count = df.duplicated().sum()
    print('\n # of Duplicate', dup_count, sep='\n')

In [18]:
# some function for EDA

def cat_boxplot(df, target, feature):
    """Draw a box plot of ``target`` for every category of ``feature``.

    The categories on the x-axis are ordered by ascending mean of
    ``target`` so that trends across categories are easy to read.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing both the ``feature`` and ``target`` columns.
    target : str
        Name of the numeric column plotted on the y-axis.
    feature : str
        Name of the categorical column plotted on the x-axis.
    """
    # Aggregate only the target column: averaging the whole frame fails on
    # non-numeric columns in recent pandas versions.
    mean_by_category = df.groupby(feature)[target].mean().sort_values()
    category_order = mean_by_category.index.tolist()

    plt.figure()
    sns.boxplot(x=feature, y=target, data=df, order=category_order)
    plt.xticks(rotation=45)

4:43: E231 missing whitespace after ','
7:80: E501 line too long (86 > 79 characters)


In [None]:
# some function for Modeling

def cv_5(model, features, target):
    """Run 5-fold cross-validation and print the MSE summary.

    Prints the estimator, then the mean and standard deviation of the
    mean squared error over the five folds. Returns nothing.

    Parameters
    ----------
    model : estimator
        Estimator passed to ``cross_val_score``.
    features : array-like
        Feature matrix.
    target : array-like
        Target values.
    """
    # Import the submodule explicitly: ``import sklearn as sk`` alone does not
    # guarantee that ``sk.model_selection`` has been loaded.
    from sklearn.model_selection import cross_val_score

    # The scorer yields negated MSE values, hence the sign flip on the mean.
    neg_mse = cross_val_score(
        model, features, target, cv=5, scoring='neg_mean_squared_error'
    )
    print(model)
    print('Mean: ' + str(-neg_mse.mean()) + ', Std: ' + str(neg_mse.std()))

## ---- 1 Define the problem ----

This is a salary prediction project: we predict salaries from job descriptions. We build a model so that, given a new job description, we can predict the salary for that job.

## ---- 2 Load the data ----

In [None]:
# Directory holding the training CSV files.
# NOTE(review): machine-specific location; adjust for your environment.
csv_path = '~/Documents/Study/DSDJ/Instructions_and_Notebook/data/'
df_features = load_data(path=csv_path, filename='train_features.csv')

In [None]:
# Preview the first 10 rows of the features table.
df_features.head(10)

In [None]:
# Summarize the features table via DataFrame.info().
df_features.info()

In [None]:
# Load the salary (target) table from the same directory.
df_salaries = load_data(path=csv_path, filename='train_salaries.csv')

In [None]:
# Preview the first 10 rows of the salary table.
df_salaries.head(10)

In [None]:
# Summarize the salary table via DataFrame.info().
df_salaries.info()

If we find a mismatch between a column's data type and its actual contents, we'll fix it here.

## ---- 3 Clean the data ---
We look for duplicate data, invalid data, or corrupt data and remove it

In [None]:
# Print NaN counts per column and the number of duplicated rows (features).
check_df(df_features)

In [None]:
# Print NaN counts per column and the number of duplicated rows (salaries).
check_df(df_salaries)

Here, salaries <= 0 are invalid.

In [None]:
# Show the rows whose salary is zero or negative (treated as invalid).
df_salaries[df_salaries['salary'] <= 0]

In [None]:
# Keep only the rows with a strictly positive salary.
df_salaries_mod = df_salaries[df_salaries['salary'] > 0]

Here, we merge the features dataframe and the salary dataframe.
`jobId` is the key shared by both tables.

In [None]:
# Inner-join salaries with their features on the shared jobId column.
df = df_salaries_mod.merge(df_features, how="inner", on="jobId")

## ---- 4 Explore the data (EDA) ----

In [None]:
# Column groups used by the EDA and modeling cells.
#   num_features -- columns holding numerical values
#   cat_features -- columns holding categorical values
#   target       -- column to predict

target = 'salary'
num_features = [
    'yearsExperience',
    'milesFromMetropolis',
]
cat_features = [
    'companyId',
    'jobType',
    'degree',
    'major',
    'industry',
]

### Numerical features

In [None]:
# Summary statistics of the merged frame (DataFrame.describe defaults).
df.describe()

In [None]:
# Inspect the distribution of the target value.
# The axis label and title are set on the returned Axes: distplot's third and
# fourth positional parameters are the `hist` and `kde` flags, not text.
# NOTE(review): distplot is deprecated in seaborn >= 0.11; consider histplot
# when upgrading.
ax = sns.distplot(df[target], bins=100)
ax.set(xlabel="salary", title="Distribution of salary");

We can see a long tail on the right side, but otherwise it looks close to a normal distribution.

In [None]:
# Inspect the pairwise relationships between the target and numerical features.
from pandas.plotting import scatter_matrix
# `target` is a plain string, so wrap it in a list before adding the
# numerical feature names (a string has no `.extend`).
attributes = [target] + num_features
scatter_matrix(df[attributes], figsize=(12, 8));

In [None]:
# Correlate numeric columns only; recent pandas versions raise when asked to
# correlate object (string) columns.
df.select_dtypes(include='number').corr()

From these analyses, salary is **positively correlated** with years of experience and **negatively correlated** with miles from metropolis.

### Categorical features
Next, we are going to see how the categorical features affect the salary.

In [None]:
# Summary of the object-typed columns of the merged frame.
df.describe(include='O') # 'O' means Object

In [None]:
# One box plot of the target per categorical feature.
for feature in cat_features:
    cat_boxplot(df=df, target=target, feature=feature)

From these analyses, salary is **correlated** with jobType, degree, major and industry, but there appears to be **little** correlation with companyId.

## ---- 5 Establish a baseline ---
Target: MSE < 360  
As a first baseline, we assume that salary has a linear relationship with years of experience.

In [None]:
from sklearn.linear_model import LinearRegression
# Baseline: ordinary least squares on years of experience only.
lm = LinearRegression().fit(df[['yearsExperience']], df['salary'])
slope, intercept = lm.coef_[0], lm.intercept_
print("Salary = ", slope, " * yearsExperience + ", intercept, sep="")

In [None]:
# 5-fold cross-validated MSE of the single-feature baseline.
cv_5(model=lm, features=df[['yearsExperience']], target=df['salary'])

The MSE (Mean Squared Error) is 1288.

## ---- 6 Hypothesize solution ----

## ---- 7 Engineer features  ----

Here, we use all the features except companyId.

In [None]:
# Drop companyId from the categorical features. Rebuilding the list instead of
# calling .remove() keeps this cell safe to re-run (no ValueError on the
# second execution).
cat_features = [name for name in cat_features if name != 'companyId']

In [None]:
# Display the current list of categorical features.
cat_features

In [None]:
# Transform categorical features into one-hot variables and place them
# next to the numerical features (both pieces share df's index).

df_num = df[num_features]
df_cat = df[cat_features]
df_ = pd.concat([df_num, pd.get_dummies(df_cat)], axis=1)

In [None]:
# Preview the engineered feature matrix instead of dumping the whole frame,
# consistent with the other preview cells.
df_.head(10)

## ---- 8 Create models ----
We are using linear regression, random forest regression and gradient boosting.

### Linear Regression

In [None]:
# Fit ordinary least squares on the full engineered feature matrix.
lr = LinearRegression()
lr.fit(X=df_, y=df[target])

In [None]:
# 5-fold cross-validated MSE of the linear model on all features.
cv_5(model=lr, features=df_, target=df[target])

### RandomForest

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Fix the seed so the cross-validation scores are reproducible between runs.
forest_reg = RandomForestRegressor(random_state=42)
cv_5(forest_reg, df_, df[target])

In [None]:
# Time a 5-fold grid search over the number of trees in the forest.
start = time.time()
param_grid = [{'n_estimators': [10, 30, 100]}]
grid_search = GridSearchCV(
    estimator=forest_reg,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)
grid_search.fit(df_, df[target])
elapsed_time = time.time() - start
print("elapsed_time:{0}".format(elapsed_time), "[sec]", sep="")
grid_search.best_params_

In [None]:
# Time a 5-fold CV run of a 150-tree forest using all available cores.
start = time.time()
# Fix the seed so the cross-validation scores are reproducible between runs.
forest_reg = RandomForestRegressor(n_estimators=150, n_jobs=-1, random_state=42)
cv_5(forest_reg, df_, df[target])
elapsed_time = time.time() - start
print("elapsed_time:{0}".format(elapsed_time) + "[sec]")

### Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
# Gradient boosting with the library's default hyper-parameters.
gb_reg = GradientBoostingRegressor()
cv_5(model=gb_reg, features=df_, target=df[target])

In [None]:
# Time a 5-fold grid search over the number of boosting stages.
start = time.time()
param_grid = [{'n_estimators': [30, 100]}]
grid_search = GridSearchCV(
    estimator=gb_reg,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)
grid_search.fit(df_, df[target])
elapsed_time = time.time() - start
print("elapsed_time:{0}".format(elapsed_time), "[sec]", sep="")

In [None]:
# Show the parameter setting selected by the most recent grid search.
grid_search.best_params_

In [None]:
# Time a 5-fold CV run of gradient boosting with 200 estimators.
start = time.time()
gb_reg = GradientBoostingRegressor(n_estimators=200)
cv_5(model=gb_reg, features=df_, target=df[target])
elapsed_time = time.time() - start
print("elapsed_time:{0}".format(elapsed_time), "[sec]", sep="")

In [None]:
# Time a 5-fold CV run of gradient boosting with 1000 estimators.
start = time.time()
gb_reg = GradientBoostingRegressor(n_estimators=1000)
cv_5(model=gb_reg, features=df_, target=df[target])
elapsed_time = time.time() - start
print("elapsed_time:{0}".format(elapsed_time), "[sec]", sep="")

In [None]:
# Fit a 200-estimator gradient boosting model on all rows and keep its
# per-feature importances for plotting.
gb_reg_200 = GradientBoostingRegressor(n_estimators=200).fit(df_, df[target])
feature_importance = gb_reg_200.feature_importances_

In [None]:
# Make importances relative to the max importance (largest becomes 100).
feature_importance = 100.0 * (feature_importance / feature_importance.max())
# Ascending order, so the most important feature ends up at the top of the
# horizontal bar chart.
sorted_idx = np.argsort(feature_importance)
pos = np.arange(sorted_idx.shape[0]) + .5
plt.figure(figsize=(6, 10))
plt.barh(pos, feature_importance[sorted_idx], align='center')
plt.yticks(pos, df_.columns[sorted_idx], rotation=30)
plt.xlabel('Relative Importance')
plt.title('Variable Importance')
# Fit the layout last so the axis label and title are taken into account.
plt.tight_layout()
plt.show()