# A) Data Preparation

In [1]:
import pandas as pd
import numpy as np

In [2]:
df_train = pd.read_csv('train_v2.csv',sep=';',encoding='latin')

In [3]:
df_test = pd.read_csv('test_v2.csv',sep=';',encoding='latin')

In [4]:
# No records with identical Id
len(df_train.Id.unique()) == len(df_train)

True

In [5]:
df_train.count()

Id                    10000
Title                  9999
FullDescription       10000
LocationNormalized    10000
ContractType           3556
ContractTime           4737
Company                5951
Category              10000
SalaryNormalized      10000
SourceName            10000
dtype: int64

In [6]:
# No records with identical Id
len(df_test.Id.unique()) == len(df_test)

True

In [7]:
df_test.count()

Id                    5000
Title                 5000
FullDescription       5000
LocationNormalized    5000
ContractType           808
ContractTime          4188
Company               4709
Category              5000
SourceName            5000
dtype: int64

In [8]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
Id                    10000 non-null int64
Title                 9999 non-null object
FullDescription       10000 non-null object
LocationNormalized    10000 non-null object
ContractType          3556 non-null object
ContractTime          4737 non-null object
Company               5951 non-null object
Category              10000 non-null object
SalaryNormalized      10000 non-null int64
SourceName            10000 non-null object
dtypes: int64(2), object(8)
memory usage: 781.3+ KB


### The FullDescription is the a kind of unformated concatenation of the other features ==> We will drop it

In [9]:
print(df_train.FullDescription.head(2),'\n')
print(df_train.Title.head(2))

0    Engineering Systems Analyst Dorking Surrey Sal...
1    Stress Engineer Glasgow Salary **** to **** We...
Name: FullDescription, dtype: object 

0    Engineering Systems Analyst
1        Stress Engineer Glasgow
Name: Title, dtype: object


# 1. Drop Id, storing SalaryNormalized and FullDescription
- Dropping Id which is useless for the prediction purpose
- Storing the SalaryNormalized features which is the target feature (y)
- Storing FullDescription, we will remove it from the training data because it is a unformatted merge of the other features
- Storing the Id of the test data

In [10]:
full_descript_tr = df_train.FullDescription
Salary = df_train.SalaryNormalized
SalaryLog = np.log(Salary)
df_train.drop(['Id','FullDescription','SalaryNormalized'],axis=1,inplace=True)
test_Id = df_test.Id
df_test.drop(['Id','FullDescription'],axis=1,inplace=True)

# 2. Merging the test and train data 
- Let us merge the training and test data in to a global dataframe **df**
- Hence, we will get the same levels for binning and dummyfication purposes

In [11]:
df = pd.concat([df_train,df_test],axis=0)
df = df.reset_index(drop=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 7 columns):
Title                 14999 non-null object
LocationNormalized    15000 non-null object
ContractType          4364 non-null object
ContractTime          8925 non-null object
Company               10660 non-null object
Category              15000 non-null object
SourceName            15000 non-null object
dtypes: object(7)
memory usage: 820.4+ KB


** We can observe that there are some missing values on the following features:**
- Title
- ContractType
- ContractTime
- Company

# 3. Title Feature
- Only one missing value: Index 1588

In [12]:
title_missing_index = df[df.Title.isnull()].index[0]
title_missing_index

1588

In [13]:
df.loc[title_missing_index]

Title                                       NaN
LocationNormalized                    Liverpool
ContractType                          full_time
ContractTime                                NaN
Company                                     NaN
Category              Healthcare & Nursing Jobs
SourceName                       careworx.co.uk
Name: 1588, dtype: object

In [14]:
full_descript_missing_title = full_descript_tr[title_missing_index]
full_descript_missing_title

'Quality Improvement Manager North West England Specialist Services ****  **** Our client, a leading provider of Supported Living and Residential Services across the UK require a Quality Improvement Manager for the North West. The successful candidate will provide a consistent and systematic approach to improve outcomes, service user experience and safety in all services across the business. This will require delivering quality standards that comply with and exceed those specified by statutory, regulatory bodies and contractual requirements. The salary on offer is ****  **** For more information or to apply please call Dave Langford at Compass Associates on **** **** **** or email dlangfordcompassltd.co.uk'

** As seen previously the first words of FullDescription feature corresponds usually the "Title", hence we will replace the missing data by the three first words of the full description. **

In [15]:
title_words = full_descript_missing_title.split(' ')[:3]
missing_title = ' '.join(title_words)
missing_title

'Quality Improvement Manager'

In [16]:
df.loc[title_missing_index,'Title'] = missing_title

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Data columns (total 7 columns):
Title                 15000 non-null object
LocationNormalized    15000 non-null object
ContractType          4364 non-null object
ContractTime          8925 non-null object
Company               10660 non-null object
Category              15000 non-null object
SourceName            15000 non-null object
dtypes: object(7)
memory usage: 820.4+ KB


## 3.1. Bag of words of titles

** Checking the number of levels of the Title feature (different titles) **

In [18]:
print(len(df.Title.unique()))

12052


- There are two many levels. So let us create a Bag of words of this features.
- Indeed, this strategy could also be applied to the FullDescription features, however we decided to apply to the Title feature

In [19]:
title_bag_of_words = [i for i in df.Title]

In [20]:
import re
letters_only = [ re.sub("[^a-zA-Z]", " ", w ).strip() for w in title_bag_of_words]
lower_case = [l.lower().strip() for l in letters_only]
title_bag_of_words = [re.sub(' +',' ',l) for l in lower_case] # remove extra blank

In [21]:
title_bag_of_words[:5]

['engineering systems analyst',
 'stress engineer glasgow',
 'modelling and simulation analyst',
 'engineering systems analyst mathematical modeller',
 'pioneer miser engineering systems analyst']

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

ngramrange = (1,5) # The range of the N-grams

M_feat = 1200 # Maximal number of features, this is actually a tunning parameter
tfidf_transformer = TfidfVectorizer(lowercase=True, 
                                    max_features = M_feat, 
                                    stop_words='english', 
                                    ngram_range=ngramrange)

tfidf_transformer.fit(title_bag_of_words)

tfidf_df = tfidf_transformer.transform(title_bag_of_words)
tfidf_df = tfidf_df.toarray()
tfidf_df = pd.DataFrame(tfidf_df)

tfidf_df.columns = tfidf_transformer.get_feature_names()
tfidf_df.head(3)

Unnamed: 0,aa,aa rosette,aberdeen,account,account director,account executive,account handler,account manager,accountant,accounts,...,worker jobs,workers,workshop,writer,year,year primary,year primary teacher,year teacher,years,yorkshire
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
# LSA
# from sklearn.decomposition import TruncatedSVD
# from sklearn.pipeline import make_pipeline
# from sklearn.preprocessing import Normalizer

# N_Components = 50
# svd = TruncatedSVD(N_Components)
# normalizer = Normalizer(copy=False)
# lsa = make_pipeline(svd, normalizer)

# tfidf_df = lsa.fit_transform(tfidf_df)
# tfidf_df = pd.DataFrame(tfidf_df)
# column_topic = ['c'+str(i) for i in range(N_Components)]
# tfidf_df.column_topic = column_topic


** Let us concatenate the tfidf dataframe to the initial dataframe and drop the column "Title" **

In [24]:
df = pd.concat([df,tfidf_df],axis=1)
df.drop('Title',axis=1,inplace=1)
df.shape

(15000, 1206)

# 4. Handling the missing values of ContractTypes,ContractTime and Company Features
- Since, there are large number of missing values, we decided to not drop the corresponding rows
- We have two options:
    - just replace by the mode (categorical features)
    - create a new category "NotAvailable" for the missing values



In [25]:
def handlingMissingData_Mode(df,column_name):
    df[column_name].fillna(df[column_name].mode()[0], inplace=True)
    return df

def handlingMissingData_NotAv(df,column_name):
    df[column_name].fillna('NotAvailable', inplace=True)
    return df

** Finally, we used the first technique of missing value imputation because we got better results with it **

In [26]:
columns_with_miss_val = ['ContractType','ContractTime','Company','Category']
for c in columns_with_miss_val:
    handlingMissingData_Mode(df,c)

In [27]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Columns: 1206 entries, LocationNormalized to yorkshire
dtypes: float64(1200), object(6)
memory usage: 138.0+ MB


# 5. Useful Functions
### These functions will be used to re-encode the categorical features and to dummify them

**a) Function which re-encodes the values which are not in the top "n" to "Else"**

In [28]:
def reEncodingValues(x,top_n):
    if x not in top_n:
        x = 'Else'
    return x

** b) Function finding the "n" most frequent values in a data frame **

In [29]:
def top_n_values(df,c,n):
    top_n_loc = df_train[c].value_counts()[:n]
    top_n_loc = top_n_loc.keys().tolist()
    return top_n_loc

**c) Functions which dummifies the columns and drops the initial columns**

In [30]:
def dummyFunction(df,c):
    c_dummies = pd.get_dummies(df[c],prefix=c)
    df = pd.concat([df, c_dummies], axis=1)
    df_out = df.drop(c,axis=1)
    return df_out

# 6. ContractType and ContratTime Features

In [31]:
print(df.ContractType.value_counts())
print()
print(df.ContractTime.value_counts())

full_time    14372
part_time      628
Name: ContractType, dtype: int64

permanent    13717
contract      1283
Name: ContractTime, dtype: int64


** For these two features we used just a dummyfication **

In [32]:
list_column = ['ContractType','ContractTime']
for c in list_column:
    df = dummyFunction(df,c)

In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15000 entries, 0 to 14999
Columns: 1208 entries, LocationNormalized to ContractTime_permanent
dtypes: float64(1200), object(4), uint8(4)
memory usage: 137.8+ MB


# 7. Location Feature

In [34]:
#Number of uniques values
len(df.LocationNormalized.unique())

1025

In [35]:
df.LocationNormalized.value_counts().head(10)

UK                   2515
London               1730
Manchester            363
The City              335
Leeds                 298
Belfast               240
South East London     215
Birmingham            202
Surrey                182
Bristol               171
Name: LocationNormalized, dtype: int64

In [36]:
N = 10
c = 'LocationNormalized'
top_N_locations = top_n_values(df.LocationNormalized,c,N)
top_N_locations

['UK',
 'London',
 'Manchester',
 'Leeds',
 'Belfast',
 'Birmingham',
 'The City',
 'Surrey',
 'Sheffield',
 'Hampshire']

In [37]:
df_train[c].head(5)

0      Dorking
1      Glasgow
2    Hampshire
3       Surrey
4       Surrey
Name: LocationNormalized, dtype: object

In [38]:
df[c] = df[c].apply(lambda x:reEncodingValues(x,top_N_locations)) 
df[c].head(5)

0         Else
1         Else
2    Hampshire
3       Surrey
4       Surrey
Name: LocationNormalized, dtype: object

In [39]:
df = dummyFunction(df,c)

# 8. Category, SourceName Company features

In [40]:
df.Category.value_counts().head(10)

Healthcare & Nursing Jobs      3412
IT Jobs                        2289
Engineering Jobs               2163
Accounting & Finance Jobs       999
Sales Jobs                      811
HR & Recruitment Jobs           727
Hospitality & Catering Jobs     659
Teaching Jobs                   549
Other/General Jobs              431
Customer Services Jobs          377
Name: Category, dtype: int64

In [41]:
df.SourceName.value_counts().head(10)

careworx.co.uk           2991
jobsite.co.uk            2240
MyUkJobs                 1559
cv-library.co.uk         1110
totaljobs.com             796
theitjobboard.co.uk       643
fish4.co.uk               441
planetrecruit.com         436
hays.co.uk                384
thecareerengineer.com     312
Name: SourceName, dtype: int64

In [42]:
df.Company.value_counts().head(10)

JOBG8                  4817
Jobsite Jobs            282
Fresh Partnership       262
ARRAY                   222
UKStaffsearch           197
Chef Results            146
Clear Selection         132
Triumph Consultants      99
JHR                      75
Castle Recruitment       74
Name: Company, dtype: int64

** According to the distribution of the different values and also based on CV results we will:**

- take the **top 5 for Category**

- **we will drop SourceName and Company features**

## 8.1. Category

In [43]:
N = 5
c = 'Category'
top_N_Category = top_n_values(df,c,N)
df[c] = df[c].apply(lambda x:reEncodingValues(x,top_N_Category)) 
df = dummyFunction(df,c)

## 8.2. SourceName and Company

In [44]:
#N = 4
#c = 'SourceName'
#top_N_Sources = top_n_values(df,c,N)
#df[c] = df[c].apply(lambda x:reEncodingValues(x,top_N_Sources)) 
#df = dummyFunction(df,c)

In [45]:
#N = 7
#c = 'Company'
#top_N_Company = top_n_values(df,c,N)
#df[c] = df[c].apply(lambda x:reEncodingValues(x,top_N_Company)) 
#df = dummyFunction(df,c)

## 8.3. Dropping SourceName and Company Features
- Actually, we got some better results without these two features

In [46]:
column_to_drop = ['Company','SourceName']
df.drop(column_to_drop,axis=1,inplace=True)

# B) Modelling

In [47]:
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.ensemble import RandomForestRegressor



In [48]:
size_train = len(df_train)
size_test = len(df_test)
print(size_train)
size_test

10000


5000

In [49]:
train_data = df.loc[0:size_train-1,]
test_data = df.loc[size_train:,]

print('train size:',train_data.shape)
print('test size:',test_data.shape)


train size: (10000, 1221)
test size: (5000, 1221)


In [50]:
X_train, X_test, y_train, y_test = train_test_split(train_data, SalaryLog, test_size=0.3,  random_state=42)

# 1. Random Forest

## 1.2 Grid search - Cross Validation

In [51]:
rf_model = RandomForestRegressor(n_estimators=100, 
                                 bootstrap = True,
                                 random_state=0,
                                 verbose = 1,
                                 n_jobs=-1)

parameters = {'max_depth':[None],
    'min_samples_split': [5,10,15],
    'max_features': ['sqrt']
}

clf = GridSearchCV(rf_model, 
                  parameters,
                  cv=5,
                  verbose=1,
                    
                  scoring='neg_mean_squared_error')

clf.fit(X_train, y_train)
rf_reg = clf.best_estimator_
print(rf_reg)
np.mean(abs(rf_reg.predict(X_test)-y_test))

Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    1.1s finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    1.3s finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    1.5s finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.5s
[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    1.3s finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jo

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='sqrt', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=5, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=-1, oob_score=False, random_state=0,
           verbose=1, warm_start=False)


[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    1.5s finished
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 100 out of 100 | elapsed:    0.0s finished


0.20213626170820018

# 1.3. Training on the global training data set

**Let us train the best estimator with the entire training data set and using a larger number of trees (the higher the better in the case of Random Forest)**

In [53]:
rf_reg = RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='sqrt', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=5, min_weight_fraction_leaf=0.0,
           n_estimators=10000, n_jobs=-1, oob_score=False, random_state=0,
           verbose=1, warm_start=False).fit(train_data,SalaryLog)

[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    0.8s
[Parallel(n_jobs=-1)]: Done 184 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 434 tasks      | elapsed:    9.1s
[Parallel(n_jobs=-1)]: Done 784 tasks      | elapsed:   16.5s
[Parallel(n_jobs=-1)]: Done 1234 tasks      | elapsed:   26.7s
[Parallel(n_jobs=-1)]: Done 1784 tasks      | elapsed:   45.7s
[Parallel(n_jobs=-1)]: Done 2434 tasks      | elapsed:  1.0min
[Parallel(n_jobs=-1)]: Done 3184 tasks      | elapsed:  1.3min
[Parallel(n_jobs=-1)]: Done 4034 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 4984 tasks      | elapsed:  2.0min
[Parallel(n_jobs=-1)]: Done 6034 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 7184 tasks      | elapsed:  3.0min
[Parallel(n_jobs=-1)]: Done 8434 tasks      | elapsed:  3.5min
[Parallel(n_jobs=-1)]: Done 9784 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 10000 out of 10000 | elapsed:  4.1min finished


# 1.4. Prediction

In [54]:
y_pred = rf_reg.predict(test_data)

[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    0.0s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:    0.1s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:    0.3s
[Parallel(n_jobs=8)]: Done 784 tasks      | elapsed:    0.6s
[Parallel(n_jobs=8)]: Done 1234 tasks      | elapsed:    0.9s
[Parallel(n_jobs=8)]: Done 1784 tasks      | elapsed:    1.3s
[Parallel(n_jobs=8)]: Done 2434 tasks      | elapsed:    1.9s
[Parallel(n_jobs=8)]: Done 3184 tasks      | elapsed:    2.4s
[Parallel(n_jobs=8)]: Done 4034 tasks      | elapsed:    3.0s
[Parallel(n_jobs=8)]: Done 4984 tasks      | elapsed:    3.7s
[Parallel(n_jobs=8)]: Done 6034 tasks      | elapsed:    4.4s
[Parallel(n_jobs=8)]: Done 7184 tasks      | elapsed:    5.2s
[Parallel(n_jobs=8)]: Done 8434 tasks      | elapsed:    6.1s
[Parallel(n_jobs=8)]: Done 9784 tasks      | elapsed:    7.1s
[Parallel(n_jobs=8)]: Done 10000 out of 10000 | elapsed:    7.2s finished


In [55]:
y_pred = np.exp(y_pred)
y_pred

array([ 29232.44201215,  34834.73884053,  48071.18564824, ...,
        34434.07662364,  29670.81655565,  32258.34319432])

In [56]:
!rm -rf result_rf.csv

In [57]:
result = pd.DataFrame({'Id':test_Id,'PredictedSalary':y_pred})
result.to_csv('result_rf.csv',index=False)

### Kaggle score: 9546.54523	

# 1.5. Features importance
- Useful (features seletion tool) for further study 

In [None]:
import matplotlib.pyplot as plt
importances = rf_reg.feature_importances_
indices = np.argsort(importances)[::-1]
std = np.std([tree.feature_importances_ for tree in rf_reg.estimators_],axis=0)
fig, ax = plt.subplots(1)
ax.bar(range(test_data.shape[1]), importances[indices],color="k", yerr=std[indices], align="center")
plt.xticks(range(train_data.shape[1]), train_data.columns[indices])
plt.xlim([-1, train_data.shape[1]])
ax.set_xticklabels(ax.xaxis.get_majorticklabels(), rotation=90)
plt.title('Features importance')
plt.show();


### Actually we started directly by the Random Forest Regressor. However, we could start by a simpler model like LASSO. Hence, Let us experiment this regressor.

# 2. Lasso Regression

# 2.1. Grid search - Cross Validation

In [416]:
from sklearn import grid_search
from sklearn.linear_model import Lasso
parameters = {'alpha':[0.5,  1.0, 5]}

model = Lasso(random_state=0) # we don't specify alpha directly, we'll specify multiple alphas to try below

gs = grid_search.GridSearchCV( model, 
                                parameters,
                                cv=5,
                                verbose=2,
                                scoring='neg_mean_absolute_error',
                                n_jobs=-1)
gs.fit(X_train, y_train)
lasso_model = gs.best_estimator_
np.mean(abs(lasso_model.predict(X_test)-y_test))

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] alpha=0.5 .......................................................
[CV] alpha=0.5 .......................................................
[CV] alpha=0.5 .......................................................
[CV] alpha=0.5 .......................................................
[CV] alpha=0.5 .......................................................
[CV] alpha=1.0 .......................................................
[CV] alpha=1.0 .......................................................
[CV] alpha=1.0 .......................................................
[CV] .............................................. alpha=1.0 - 1.0min
[CV] alpha=1.0 .......................................................
[CV] .............................................. alpha=0.5 - 1.2min
[CV] alpha=1.0 .......................................................
[CV] .............................................. alpha=0.5 - 1.3min
[CV] alpha=5 ....

[Parallel(n_jobs=-1)]: Done   8 out of  15 | elapsed:  1.6min remaining:  1.4min


[CV] ................................................ alpha=5 -  11.7s
[CV] ................................................ alpha=5 -  12.7s
[CV] .............................................. alpha=0.5 - 1.7min
[CV] .............................................. alpha=0.5 - 1.9min
[CV] .............................................. alpha=1.0 -  51.0s
[CV] .............................................. alpha=0.5 - 1.9min
[CV] .............................................. alpha=1.0 -  44.1s


[Parallel(n_jobs=-1)]: Done  15 out of  15 | elapsed:  2.0min finished


7566.371814228326

In [417]:
lasso_model

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=0,
   selection='cyclic', tol=0.0001, warm_start=False)

# 2.2. Train on the global training data set

** Let us train with the global train data **

In [418]:
Lasso_final = Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
   normalize=False, positive=False, precompute=False, random_state=0,
   selection='cyclic', tol=0.0001, warm_start=False).fit(train_data,Salary)

# 2.3. Prediction 

In [419]:
y_pred = Lasso_final.predict(test_data)

In [420]:
!rm -rf result_lasso.csv
result = pd.DataFrame({'Id':test_Id,'PredictedSalary':y_pred})
result.to_csv('result_lasso.csv',index=False)

## Kaggle Lasso = 9812.98369

In [58]:
import xgboost as xgb
xgb_model = xgb.XGBRegressor()
clf = GridSearchCV(xgb_model,
                   {'max_depth': [2,4,6],
                    'n_estimators': [50,100,200,500]},
                   n_jobs = -1,
                   verbose=2)

clf.fit(X_train,y_train)
clf.best_score_, clf.best_params_

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV] max_depth=2, n_estimators=50 ....................................
[CV] max_depth=2, n_estimators=50 ....................................
[CV] max_depth=2, n_estimators=50 ....................................
[CV] max_depth=2, n_estimators=100 ...................................
[CV] max_depth=2, n_estimators=100 ...................................
[CV] max_depth=2, n_estimators=100 ...................................
[CV] max_depth=2, n_estimators=200 ...................................
[CV] max_depth=2, n_estimators=200 ...................................
[CV] ........................... max_depth=2, n_estimators=50 -   9.4s
[CV] max_depth=2, n_estimators=200 ...................................
[CV] ........................... max_depth=2, n_estimators=50 -   9.8s
[CV] max_depth=2, n_estimators=500 ...................................
[CV] ........................... max_depth=2, n_estimators=50 -  10.4s
[CV] max_depth=2

[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:  8.4min finished


(0.584089124131479, {'max_depth': 6, 'n_estimators': 500})

In [60]:
clf.best_estimator_

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=None, n_estimators=500, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [55]:
xgbreg = clf.best_estimator_
print(xgbreg)
np.mean(abs(xgbreg.predict(X_test)-y_test))

XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=None, n_estimators=500, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)


7107.593250325521

In [61]:
xgbreg = xgb.XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=6,
       min_child_weight=1, missing=None, n_estimators=500, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1).fit(train_data,SalaryLog)

In [62]:
y_pred = xgbreg.predict(test_data)
result = pd.DataFrame({'Id':test_Id,'PredictedSalary':np.exp(y_pred)})
result.to_csv('result_xgb.csv',index=False)

# Kaggle: 9339.03711