In [1]:
# from: https://www.youtube.com/watch?v=sm5xeKal72I

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides pandas/scikit-learn warnings raised by later cells.
warnings.filterwarnings('ignore')

In [3]:
# Read the raw CSV that sits next to the notebook, then preview its first rows.
data = pd.read_csv('./data.csv')
data.head(n=3)

Unnamed: 0,id,gender,height,religion,caste,mother_tongue,profession,location,country,age_of_marriage
0,1,female,"5'4""",,others,Telugu,,London,United Kingdom,21.0
1,2,male,"5'7""",Jain,Shwetamber,Gujarati,Doctor / Healthcare Professional,Fairfax- VA,USA,32.0
2,3,male,"5'7""",Hindu,Brahmin,Hindi,Entrepreneurs / Business,Begusarai,India,32.0


In [4]:
# Number of missing values in each column.
data.isna().sum()

id                   0
gender              29
height             118
religion           635
caste              142
mother_tongue      164
profession         330
location           155
country             16
age_of_marriage     19
dtype: int64

In [5]:
# Share of rows that would be lost if every row containing a NaN were dropped.
total_rows = data.shape[0]
complete_rows = data.dropna().shape[0]
(total_rows - complete_rows) / total_rows

0.24737047136735488

In [6]:
# Keep only rows without any missing value.
# `data` is rebound to the filtered frame instead of being mutated in place,
# so the statement reads as an explicit transformation step.
# The original row labels are kept (no index reset).
data = data.dropna()
print(data.shape)
data.head(3)

(1932, 10)


Unnamed: 0,id,gender,height,religion,caste,mother_tongue,profession,location,country,age_of_marriage
1,2,male,"5'7""",Jain,Shwetamber,Gujarati,Doctor / Healthcare Professional,Fairfax- VA,USA,32.0
2,3,male,"5'7""",Hindu,Brahmin,Hindi,Entrepreneurs / Business,Begusarai,India,32.0
3,4,female,"5'0""",Hindu,Thakur,Hindi,Architect,Mumbai,India,30.0


In [7]:
# Frequency of each caste label in the cleaned data.
data.caste.value_counts()

 Brahmin                 252
 Kshatriya                70
 Lingayath                65
 Agarwal                  53
 Shwetamber               46
 Patel                    42
 Bhandari                 41
 Vaishnav                 41
 Baniya                   41
 Arora                    41
 Viswabrahmin             40
 Marthoma                 40
 Memon                    40
 Agri                     40
 Ahom                     40
 Kaibarta                 40
 Gursikh                  40
 OBC - Barber/Naayee      40
 Rajput - Lodhi           40
 Born Again               40
 Vanniyar                 40
 Panchal                  40
 Thakur                   40
 Sahu                     40
 Sindhi-Sakkhar           40
 Valmiki                  40
 Bhatia                   40
 Ramdasia                 40
 Baishnab                 40
 Balija                   40
 Roman Catholic           40
 Khatri                   40
 Billava                  40
 Goud                     40
 Kayastha     

In [8]:
from sklearn.preprocessing import LabelEncoder

# String-valued columns that are replaced by integer codes.
categorical_cols = ['gender'
                    , 'religion'
                    , 'caste'
                    , 'mother_tongue'
                    , 'country'
                   ]

# One encoder per column. Reusing a single LabelEncoder would refit it for
# every column and leave only the last mapping available, which makes it
# impossible to decode the codes or encode new rows the same way later.
encoders = {col: LabelEncoder() for col in categorical_cols}
for col in categorical_cols:
    data[col] = encoders[col].fit_transform(data[col])

# Backward compatibility: `enc` is still bound to an encoder fitted on the
# last column in the list, as it was when one encoder was reused.
enc = encoders[categorical_cols[-1]]

In [9]:
# Preview the frame after the categorical columns were label-encoded.
data.head(3)

Unnamed: 0,id,gender,height,religion,caste,mother_tongue,profession,location,country,age_of_marriage
1,2,1,"5'7""",2,34,6,Doctor / Healthcare Professional,Fairfax- VA,19,32.0
2,3,1,"5'7""",1,14,8,Entrepreneurs / Business,Begusarai,5,32.0
3,4,0,"5'0""",1,36,8,Architect,Mumbai,5,30.0


In [10]:
# Feature columns used for the model; `height` is still the raw string here.
feature_cols = ['gender'
                , 'height'
                , 'religion'
                , 'caste'
                , 'mother_tongue'
                , 'country']

# Take an explicit copy: later cells add and drop columns on X, and those
# edits must not be ambiguous with respect to `data`.
X = data.loc[:, feature_cols].copy()

# Regression target.
y = data.age_of_marriage

In [11]:
# Preview the selected feature columns.
X.head(3)

Unnamed: 0,gender,height,religion,caste,mother_tongue,country
1,1,"5'7""",2,34,6,19
2,1,"5'7""",1,14,8,5
3,0,"5'0""",1,36,8,5


In [12]:
# feet part of the height string (text before the apostrophe) in cm; 1 ft = 30.48 cm
feet_text = X.loc[1, 'height'].split('\'')[0]
int(feet_text)*30.48

152.4

In [13]:
# convert height string into cm 2.54 cm = 1 in
# Take everything between the apostrophe and the inch mark, not just the first
# character, so two-digit inch values (10, 11) are not truncated to 1.
int(X.loc[1, 'height'].split('\'')[1].split('"')[0])*2.54

17.78

In [14]:
# or: remove the inch mark from the text after the apostrophe, then convert to cm
inches_text = X.loc[1, 'height'].split('\'')[1].replace('"','')
int(inches_text)*2.54

17.78

In [15]:
# combined: feet and inches of one sample row, both converted to cm and summed
sample_parts = X.loc[1, 'height'].split('\'')
sample_feet = int(sample_parts[0])
sample_inches = int(sample_parts[1].replace('"',''))
sample_feet*30.48 + sample_inches*2.54

170.18

In [16]:
# create a function to change height into cm
def height_cm(h):
    """Convert a feet/inches height string to centimetres.

    Parameters
    ----------
    h : str
        Height written as ``<feet>'<inches>"``, e.g. ``5'7"``. The inches part
        (and the inch mark) may be missing, e.g. ``6'``; it then counts as 0.

    Returns
    -------
    float
        ``feet * 30.48 + inches * 2.54``.

    Raises
    ------
    ValueError
        If the feet or inches text is not an integer.
    """
    # Split once and reuse the pieces instead of splitting the string twice.
    parts = h.split('\'')
    feet = int(parts[0])

    # Text after the apostrophe, without the inch mark; empty means "no inches".
    inches_text = parts[1].replace('"', '').strip() if len(parts) > 1 else ''
    inches = int(inches_text) if inches_text else 0

    return feet*30.48 + inches*2.54

In [17]:
# Add the height in centimetres, computed row by row from the raw height string.
X['height_cms'] = X['height'].apply(height_cm)

In [18]:
# Check the new height_cms column next to the original height strings.
X.head(3)

Unnamed: 0,gender,height,religion,caste,mother_tongue,country,height_cms
1,1,"5'7""",2,34,6,19,170.18
2,1,"5'7""",1,14,8,5,170.18
3,0,"5'0""",1,36,8,5,152.4


In [19]:
# drop height column
# The raw string column is no longer needed once height_cms exists. X is
# rebound to the result instead of being modified in place.
X = X.drop(columns=['height'])
X.head(3)

Unnamed: 0,gender,religion,caste,mother_tongue,country,height_cms
1,1,2,34,6,19,170.18
2,1,1,14,8,5,170.18
3,0,1,36,8,5,152.4


In [20]:
# Train Test Split
from sklearn.model_selection import train_test_split

In [21]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [22]:
# Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor

# random_state pins the forest's internal randomness so that refitting on the
# same split gives the same model and the same evaluation numbers.
# NOTE(review): max_depth = 1 restricts every tree to a single split; confirm
# this is intended.
model = RandomForestRegressor(n_estimators = 80
                              , max_depth = 1
                              , random_state = 42
                             )
model.fit(X_train, y_train)

# Predictions on the held-out rows, used by the evaluation cells.
y_pred = model.predict(X_test)

In [23]:
# Evaluation
from sklearn.metrics import mean_absolute_error, r2_score

In [24]:
# Mean absolute error between the held-out targets and the predictions.
mae = mean_absolute_error(y_test, y_pred)
print(f'MAE: {mae}')

MAE: 1.8326303090821803


In [25]:
# Coefficient of determination of the predictions on the held-out rows.
r2 = r2_score(y_test, y_pred)
print(f'R2 Score: {r2}')

R2 Score: 0.06531944322638727


In [26]:
# export model
# Serialise the fitted estimator to a file in the working directory.
# NOTE(review): only `model` is written here; the label encodings applied to
# the categorical columns are not saved with it.
import joblib
joblib.dump(model, "marriage_age_prediction_model.ml")

['marriage_age_prediction_model.ml']