#### Data Preparation
#### Machine Learning Modelling
#### Model Evaluation
#### Export Trained Model
#### Local rest API with Flask Web Server or using AWS and Flask.

In [23]:
import pandas as pd
data=pd.read_csv('age_of_marriage_data.csv')
data.head()

Unnamed: 0,id,gender,height,religion,caste,mother_tongue,profession,location,country,age_of_marriage
0,1,female,"5'4""",,others,Telugu,,London,United Kingdom,21.0
1,2,male,"5'7""",Jain,Shwetamber,Gujarati,Doctor / Healthcare Professional,Fairfax- VA,USA,32.0
2,3,male,"5'7""",Hindu,Brahmin,Hindi,Entrepreneurs / Business,Begusarai,India,32.0
3,4,female,"5'0""",Hindu,Thakur,Hindi,Architect,Mumbai,India,30.0
4,5,male,"5'5""",Christian,Born Again,Malayalam,Sales Professional / Marketing,Sulthan Bathery,India,30.0


In [24]:
data.isna().sum()

id                   0
gender              29
height             118
religion           635
caste              142
mother_tongue      164
profession         330
location           155
country             16
age_of_marriage     19
dtype: int64

In [25]:
data.shape

(2567, 10)

24% of records are null

In [26]:
(data.shape[0]-data.dropna().shape[0])/data.shape[0]

0.24737047136735488

In [27]:
data.dropna(inplace=True)

In [28]:
print(data.shape)
data.head()

(1932, 10)


Unnamed: 0,id,gender,height,religion,caste,mother_tongue,profession,location,country,age_of_marriage
1,2,male,"5'7""",Jain,Shwetamber,Gujarati,Doctor / Healthcare Professional,Fairfax- VA,USA,32.0
2,3,male,"5'7""",Hindu,Brahmin,Hindi,Entrepreneurs / Business,Begusarai,India,32.0
3,4,female,"5'0""",Hindu,Thakur,Hindi,Architect,Mumbai,India,30.0
4,5,male,"5'5""",Christian,Born Again,Malayalam,Sales Professional / Marketing,Sulthan Bathery,India,30.0
5,6,male,"5'5""",Hindu,Valmiki,Hindi,Sportsman,Delhi,India,29.0


we have only two numerical attributes and it shows id does not have influence on dependant variable
age_of_marriage.
normally a strong correlation is expected when it is [0.7,1.0] or [-0.7,-1]

In [29]:
data.corr()

Unnamed: 0,id,age_of_marriage
id,1.0,0.00682
age_of_marriage,0.00682,1.0


profession,location have high cardinality(large unique values) and does not contribute to prediction of age_of_marriage dependant variable

In [30]:
X=data.loc[:,['gender','height','religion','caste','mother_tongue','country']]
y=data.age_of_marriage

# convert the string columns to numerical attributes using label encoding
##### label encoding is for nominal variables where as one hot encoding is for ordinal variables
##### for e.g.
##### ordinal variable values: e.g. say 
##### High
##### Low
##### Medium 
##### for e.g. 
##### categorical nominal variables: e.g. country
##### India
##### Japan
##### Australia
##### USA
##### UK


In [33]:
from sklearn.preprocessing import LabelEncoder
enc=LabelEncoder()
X.loc[:,['gender','religion','caste','mother_tongue','country']] = \
X.loc[:,['gender','religion','caste','mother_tongue','country']].apply(enc.fit_transform)

In [34]:
X.head()

Unnamed: 0,gender,height,religion,caste,mother_tongue,country
1,1,"5'7""",2,34,6,19
2,1,"5'7""",1,14,8,5
3,0,"5'0""",1,36,8,5
4,1,"5'5""",0,13,13,5
5,1,"5'5""",1,38,8,5


height is not numerical field and convert this to cms.

In [35]:
def h_cms(h):
    return int(h.split('\'')[0])*30.48 + int(h.split('\'')[1].replace('"',''))*2.54

In [40]:
X["height_cms"]=X.height.apply(h_cms)
X.drop('height',inplace=True,axis=1)

In [41]:
X.head()

Unnamed: 0,gender,religion,caste,mother_tongue,country,height_cms
1,1,2,34,6,19,170.18
2,1,1,14,8,5,170.18
3,0,1,36,8,5,152.4
4,1,0,13,13,5,165.1
5,1,1,38,8,5,165.1


In [43]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

RandomForestRegressor for predicting a continuos variable and RandomForestClassifier for predicting a classification/discrete
variable

In [53]:
from sklearn.ensemble import RandomForestRegressor
model=RandomForestRegressor(n_estimators=80,max_depth=11)
model.fit(X_train,y_train)
y_predict=model.predict(X_test)

mae,rmse,mse and r2_score are for continuos variables.
confusion_matrix is used for classification/discrete variables

In [54]:
from sklearn.metrics import mean_absolute_error,r2_score
print("MAE :", mean_absolute_error(y_test,y_predict))
r2_score(y_test,y_predict)

MAE : 1.0265293839836345


0.7012576989465316

## IMPROVE THE MODEL USING RANDOMSEARCHCV and GRIDSEARCHCV

##### n_estimators = number of trees in the foreset
##### max_features = max number of features considered for splitting a node
##### max_depth = max number of levels in each decision tree
##### min_samples_split = min number of data points placed in a node before the node is split
##### min_samples_leaf = min number of data points allowed in a leaf node
##### bootstrap = method for sampling data points (with or without replacement)

In [59]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 20, stop = 150, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 100, num = 10)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [20, 34, 48, 63, 77, 92, 106, 121, 135, 150], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


## RANDOM SEARCH TRAINING

In [60]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   11.4s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   19.8s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:   30.4s finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators='warn',
                                                   n_jobs=None, oob_score=False,
                                                   random_sta...


In [61]:
# print best params
rf_random.best_params_

{'n_estimators': 48,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': None,
 'bootstrap': True}

In [62]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape
    print('Model Performance')
    print('Average Error: {:0.4f} .'.format(np.mean(errors)))
    print('Accuracy = {:0.2f}%.'.format(accuracy))
    
    return accuracy


In [63]:
base_model = RandomForestRegressor(n_estimators = 80,max_depth=11, random_state = 42)
base_model.fit(X_train, y_train)
base_accuracy = evaluate(base_model, X_test, y_test)

Model Performance
Average Error: 1.0306 .
Accuracy = 96.54%.


In [64]:
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, X_test, y_test)

Model Performance
Average Error: 1.0015 .
Accuracy = 96.63%.


In [65]:
print('Improvement of {:0.2f}%.'.format( 100 * (random_accuracy - base_accuracy) / base_accuracy))

Improvement of 0.10%.


## Fine Tune with Grid Search with Cross Validation

In [67]:
from sklearn.model_selection import GridSearchCV
# Create the parameter grid based on the results of random search 
param_grid = {
    'bootstrap': [True],
    'max_depth': [None, 10, 12, 15,20],
    'max_features': ['auto'],
    'min_samples_leaf': [1, 2],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [30, 40, 50, 60,70,80]
}
# Create a based model
rf = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                          cv = 3, n_jobs = -1, verbose = 2)

In [68]:
# Fit the grid search to the data
grid_search.fit(X_train, y_train)
grid_search.best_params_


Fitting 3 folds for each of 180 candidates, totalling 540 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    9.6s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   14.9s
[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:   23.2s
[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed:   30.8s finished


{'bootstrap': True,
 'max_depth': None,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 10,
 'n_estimators': 80}

In [69]:
best_grid = grid_search.best_estimator_
grid_accuracy = evaluate(best_grid, X_test, y_test)

Model Performance
Average Error: 1.0088 .
Accuracy = 96.61%.


In [70]:
print('Improvement of {:0.2f}%.'.format( 100 * (grid_accuracy - base_accuracy) / base_accuracy))

Improvement of 0.07%.


redefine the model based on the best parameters from random Search and grid search

best params from random search
{'n_estimators': 48,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': None,
 'bootstrap': True}

In [76]:

model=grid_search.best_estimator_

### EXPORT THE MODEL

In [77]:
from sklearn.externals import joblib
joblib.dump(model,'marriage_age_predic_model.ml')

['marriage_age_predic_model.ml']

# references
https://towardsdatascience.com/hyperparameter-tuning-the-random-forest-in-python-using-scikit-learn-28d2aa77dd74