### Training a Machine Learning model for a regression problem

In [None]:
# If you'd like to install packages that aren't installed by default, uncomment the last two lines of this cell and replace <package list> with a list of your packages.
# This will ensure your notebook has all the dependencies and works everywhere

#import sys
#!{sys.executable} -m pip install <package list>

In [None]:
#Libraries
import pandas as pd
import numpy as np
import missingno as mno
import seaborn as sns
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.preprocessing import LabelBinarizer, Normalizer
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.metrics import mean_absolute_error
import warnings
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_log_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

warnings.filterwarnings('ignore')
pd.set_option("display.max_columns", 101)

## Data Description

Column | Description
:---|:---
`id` | Record index
`timestamp` | Datetime (YYYY:MM:DD HH:MM:SS) when data was collected
`country` | Current country of employment
`employment_status` | Whether a candidate is Full time, Part time, Independent or freelancer or company owner
`job_title` | Current job title of the candidate
`job_years` | Total job experience (in Years)
`is_manager` | Whether the candidate holds a managerial position or not (Yes or No)
`hours_per_week` | No. of hours per week committed to the current job
`telecommute_days_per_week` | No. of telecommuting days per week (working from home)
`education` | The highest degree in education the candidate has received
`is_education_computer_related` | Is the education related to the field of computer science (Yes or No)
`certifications` | Does the candidate have any relevant certifications (Yes or No)
`salary` | Monthly Salary (in US $$)

## Data Wrangling & Visualization

In [None]:
# Load the training data from train.csv (path is relative to the working directory)
data = pd.read_csv("train.csv")

In [None]:
# Dimensions of the training data as (rows, columns)
data.shape

In [None]:
# Show the first few rows of the training data
data.head()

In [None]:
# List the column names of the training data
data.columns

In [None]:
# Summary statistics of the training data
data.describe()

In [None]:
# Per-column overview: non-null counts and dtypes
data.info()

In [None]:
# Data type of every column
data.dtypes

In [None]:
# Count the missing values in each column
data.isnull().sum()

In [None]:
# Visualize where the missing values sit in the training data (missingno matrix plot)
mno.matrix(data, figsize = (20, 6))

In [None]:
# Fill the gaps in hours_per_week and telecommute_days_per_week with the
# median of the respective column.
for col in ('hours_per_week', 'telecommute_days_per_week'):
    col_median = data[col].median()
    data[col] = data[col].fillna(col_median)

##### Next, we have some missing values in the is_education_computer_related column, which is a categorical variable, so we cannot simply impute them with a median. The simplest way to deal with them is to remove all rows in which this column is NaN.

In [None]:
# Drop every row that still contains a missing value. dropna() is called
# without a subset, so a NaN in any column removes the row.
data = data.dropna()

In [None]:
# Re-count the missing values per column after imputation and row removal
data.isnull().sum()

In [None]:
# Visually confirm that no missing values remain in the training data.
mno.matrix(data, figsize = (20, 6))

## Visualization, Modeling, Machine Learning

Build a model that can predict salary and identify how different features influence their decision? Please explain your findings effectively to technical and non-technical audiences using comments and visualizations, if appropriate.
- **Build an optimized model that effectively solves the business problem.**
- **The model would be evaluated on the basis of mean absolute error.**
- **Read the test.csv file and prepare features for testing.**

In [None]:
# Load the test data from test.csv and show its (rows, columns) dimensions
test = pd.read_csv('test.csv')
test.shape

In [None]:
# Show the first few rows of the test data
test.head()

In [None]:
# Count the missing values in each column of the test data
test.isnull().sum()

In [None]:
# Impute gaps in the numeric test columns with medians taken from the training
# data (`data`) rather than from the test set, so test rows are filled with
# statistics the model was built on and no test-set information is used.
# telecommute_days_per_week is imputed in the training data as well, so it is
# handled here too; fillna is a no-op for a column without missing values.
for col in ('hours_per_week', 'telecommute_days_per_week'):
    test[col] = test[col].fillna(data[col].median())

In [None]:
# Visualize the remaining missing values (if any) in the test data.
mno.matrix(test, figsize = (20, 6))

In [None]:
# Label-encode the categorical features on a copy of the data. The copy is used
# for analysis only; `data` keeps the original string labels.

train_data = data.copy()

cols = ['employment_status', 'job_title', 'is_manager', 'certifications', 
        'education', 'is_education_computer_related']

for c in cols: # traverse each column
    # pd.factorize numbers the labels in order of first appearance, so code i
    # corresponds to data[c].unique()[i]. Writing the codes one label at a time
    # into the string column would leave it with object dtype; factorize yields
    # an integer column that later numeric steps (e.g. corr()) can use.
    # NOTE(review): factorize codes missing values as -1; this assumes rows with
    # missing values were already removed from `data` - confirm.
    train_data[c] = pd.factorize(train_data[c])[0]

    print("Actual values in column:", c, "\n",  data[c].unique().tolist(), '\n')
    print("Encoded values in column:", c, "\n", train_data[c].unique().tolist(), '\n')

In [None]:
# List the columns of the label-encoded analysis copy
train_data.columns

### Visualizations

In [None]:
# Joint plots with a fitted regression line: salary against each of the
# numeric variables job_years and hours_per_week.

cols = ["job_years", "hours_per_week"]
for feature in cols:
    sns.jointplot(data=data, x=feature, y="salary", kind='reg', height=5)
plt.show()

##### From the plots above we can clearly see that job_years has a relation with salary. The more job_years means more monthly earning(salary). Similar trend is visible for hours_per_week variable in relation to the target variable (salary).

In [None]:
# Distribution plots (density-scaled histogram with a KDE curve) for the
# numeric variables. sns.distplot is deprecated in seaborn; histplot with
# kde=True and stat="density" is its documented replacement.
cols = ["job_years", "hours_per_week"]
for c in cols:
    sns.histplot(data[c], kde=True, stat="density")
    plt.grid()
    plt.show()

##### The distributions plotted above are interesting. For job_years, we see a slightly skewed distribution which shows that the majority of professionals in our data have less than 10 years of total job experience. For hours_per_week we see a somewhat bimodal distribution showing that most people work from 40 to 45 hours per week.

In [None]:
# Distribution of the target variable (density-scaled histogram with a KDE
# curve), followed by its observed range. sns.distplot is deprecated in
# seaborn; histplot with kde=True and stat="density" is its documented
# replacement.
sns.histplot(data['salary'], kde=True, stat="density")
plt.grid()
plt.title('Distribution of Target Variable in Data')
plt.show()
print('max:', np.max(data['salary']))
print('min:', np.min(data['salary']))

##### For target variable i.e. salary we can see a smooth normal distribution with a bulge at the mean salary point.

In [None]:
# Bar plots of salary for each level of the categorical/discrete features,
# laid out on a 3x2 grid. train_data is plotted, so the label-encoded columns
# appear on the x axis as their integer codes.
cols = ['employment_status', 'job_title', 'is_manager', 'certifications', 
        'telecommute_days_per_week', 'education']

fig, axes = plt.subplots(3, 2, figsize=(16, 16))

for ax, c in zip(axes.ravel(), cols):
    sns.barplot(data=train_data, x=c, y="salary", ax=ax)

In [None]:
# Print the original label found at a given position of each column's unique
# values (order of first appearance); the positions are meant to point at the
# level with the highest salary in the bar plots.
# NOTE(review): the positions are hard-coded and must be re-read from the plots
# whenever the data changes.
# NOTE(review): telecommute_days_per_week is not among the label-encoded
# columns, so position 5 of its unique values need not be the value plotted at
# x=5 - confirm.
label_positions = [
    ('employment_status', 1),
    ('job_title', 11),
    ('is_manager', 0),
    ('certifications', 1),
    ('telecommute_days_per_week', 5),
    ('education', 0),
]
for column, position in label_positions:
    print(data[column].unique().tolist()[position])

#####  We see the following trends from above plots:
1. People with employment_status = 1 i.e. 'Independent consultant, contractor, freelancer,  or company owner' earn more than full and half time employees. 
2. People with job_title Sr Consultant earn more than other professionals. 
3. We also see that people who hold managerial positions earn more than those who don't.
4. People with certifications have little difference in monthly salary than those who don't. 
5. For people having 3 or more telecommute_days_per_week, have higher salaries which might suggest that these people are actually independent contractors or freelancers. 
6. As for education, the trend shows that people with a Bachelors degree of 4 years earn more as compared to others. 

In [None]:
# List the columns of the analysis copy before computing correlations
train_data.columns

In [None]:
# Correlation heatmap of the numeric features.
# corr() is restricted to numeric columns explicitly: recent pandas versions
# raise on string columns instead of silently skipping them. infer_objects()
# first turns object columns that hold only integers (e.g. label codes written
# into a former string column) into integer columns so they are included.
plt.figure(figsize = (15,10))
numeric_features = train_data.infer_objects().select_dtypes(include='number')
sns.heatmap(numeric_features.corr(), cmap="CMRmap", annot=True)
plt.show()

##### The correlation matrix above shows that there is no correlation among the features.

### Feature Encoding and Normalization

Before training the model, we should perform one-hot encoding for all categorical/discrete variables, normalize continuous variables and then combine all data to form the training set.

In [None]:
# Work on a fresh copy of the cleaned data so that `data` itself is not
# modified by the encoding steps, then preview it.
data_train = data.copy()

data_train.head()

In [None]:
# Drop the id and timestamp columns.
# NOTE(review): country is NOT dropped here although the original comment said
# so; it stays in data_train - confirm which behaviour is intended.
data_train = data_train.drop(columns=['id', 'timestamp'])

In [None]:
# Collect the categorical features: every object-dtype column except the two
# Yes/No columns, which are excluded from this list.
excluded = ('is_manager', 'certifications')
cat_cols = []
for name, dtype in data_train.dtypes.items():
    if dtype == 'object' and name not in excluded:
        cat_cols.append(name)
cat_data = data_train[cat_cols]
cat_cols


In [None]:
# Dimensions of the working copy as (rows, columns)
data_train.shape

In [None]:
# Encode the Yes/No columns as 1/0.
binary_cols = ['is_manager', 'certifications']
for c in binary_cols:
    data_train[c] = data_train[c].replace({'Yes': 1, 'No': 0})

In [None]:
# One-hot encode the columns listed in cat_cols; drop_first=True omits the
# first level of each feature from the output. All other columns are kept as
# they are.
final_data = pd.get_dummies(data_train, columns=cat_cols, drop_first= True)

final_data.shape

In [None]:
# Preview the one-hot encoded training data
final_data.head()

In [None]:
# Columns of the working copy before one-hot encoding
data_train.columns

In [None]:
# # adding remaining cols
# for c in data_train.columns:
#     final_data[c] = data_train[c].values

# print(final_data.shape)

In [None]:
# Numerical features: every column that is neither categorical, nor binary,
# nor the target.
non_numeric = set(cat_cols) | set(binary_cols) | {'salary'}
num_cols = [c for c in data_train.columns if c not in non_numeric]
num_cols

In [None]:
# Standardize the numeric columns; the fitted `scaler` is kept so the same
# transformation can be applied to other data later.
# NOTE(review): the scaler is fitted on all of final_data, i.e. before any
# train/validation split.
scaler = StandardScaler()
final_data[num_cols] = scaler.fit_transform(final_data[num_cols])



**The management wants to know the most important features for the model.**

> #### Task:
- **Visualize the top 20 features and their feature importance.**


### Train Test Split

In [None]:
# Separate the target from the features and hold out 20% of the rows for
# validation. random_state is fixed so the split - and every number reported
# from it - is the same on each run of the notebook.
y = final_data['salary']
X = final_data.drop(columns=['salary'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Training Set Dimensions:", X_train.shape)
print("Validation Set Dimensions:", X_test.shape)

In [None]:
# Display the training feature matrix
X_train

### Model Training

In [None]:
# Train a random forest regression model with default hyper-parameters.
# random_state is fixed so the fitted forest, its scores and its feature
# importances are reproducible across runs.
randomf = RandomForestRegressor(random_state=42)

randomf.fit(X_train, y_train)

In [None]:
# Evaluate on the training and validation sets. Predictions are computed once
# and reused. MAE is reported because it is the metric the model is judged on;
# MAPE (mean absolute error relative to the true salary, in percent) is kept
# as a scale-free complement.
train_pred = randomf.predict(X_train)
val_pred = randomf.predict(X_test)

print('MAE for train set:', mean_absolute_error(y_train, train_pred))
print('MAE for validation set:', mean_absolute_error(y_test, val_pred))
print('MAPE for train set:', np.mean(np.abs((y_train - train_pred)) / y_train) * 100)
print('MAPE for validation set:', np.mean(np.abs((y_test - val_pred)) / y_test) * 100)

In [None]:
# Compute the feature importances of the random forest and plot the 20 largest.
# The table is built in one step from the importances array: growing a frame
# row by row with DataFrame.append no longer works (append was removed in
# pandas 2.0) and produced object-dtype columns.
feature_imp = (
    pd.Series(randomf.feature_importances_, index=X_train.columns, name='relative_importance')
    .rename_axis('feature')
    .sort_values()   # ascending, so the most important features come last
    .to_frame()
)
feature_imp.iloc[-20:,:].plot(kind='barh',figsize=(10,8))
plt.show()

> #### Task:
- **Submit the predictions on the test dataset using the optimized model** <br/>
    For each record in the test set (`test.csv`), predict the value of the `salary` variable. Submit a CSV file with a header row and one row per test entry. 

The file (`submissions.csv`) should have exactly 2 columns:
   - **id**
   - **salary**

### Encode and Normalize features of Test Data

In [None]:
# Work on a copy of the test data: keep the id column aside in `ids`, then
# remove id and timestamp from the feature frame.
test_data = test.copy()
ids = test_data.pop('id')
test_data = test_data.drop(columns=['timestamp'])

In [None]:
# Encode the Yes/No columns of the test data as 1/0.
binary_cols = ['is_manager', 'certifications']
for c in binary_cols:
    test_data[c] = test_data[c].replace({'Yes': 1, 'No': 0})

test_data.shape

In [None]:
# One-hot encode the test data and align it with the training feature matrix.
# Encoding the test set on its own yields whatever levels occur in the test
# data, so its columns need not match those the model was trained on. All
# levels are therefore encoded (no drop_first) and the frame is re-indexed to
# the training columns: levels unseen in training are discarded, levels absent
# from the test set become all-zero columns, and the level dropped during
# training is dropped here too.
encoded_test_data = pd.get_dummies(test_data, columns=cat_cols)
encoded_test_data = encoded_test_data.reindex(
    columns=final_data.columns.drop('salary'), fill_value=0
)

encoded_test_data.shape

In [None]:
# # adding remaining cols
# for c in test_data.columns:
#     encoded_test_data[c] = test_data[c].values

# print(encoded_test_data.shape)
# print(encoded_test_data.isnull().values.any())

In [None]:
# Standardize the numeric test columns with the existing `scaler`; only
# transform() is called, so the scaler is not refitted here.
scaled_test_numeric = scaler.transform(encoded_test_data[num_cols])
encoded_test_data[num_cols] = scaled_test_numeric
encoded_test_data