In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
from sklearn import metrics
from sklearn import linear_model
from sklearn.cross_validation import train_test_split

%matplotlib inline

In [2]:
""" loading for Rm data"""
rm = pd.read_csv('C:/vikas/data/ML/rm_training_01.csv')

In [4]:
""" Filling up the NAN Value in Rm dataset """
rm['last_job_type'].fillna(0,inplace=True)
rm['last_job_industry'].fillna(0,inplace=True)
rm.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 818 entries, 0 to 817
Data columns (total 10 columns):
employee_id                818 non-null int64
number_of_jobs             818 non-null int64
last_job_type              818 non-null object
last_job_industry          818 non-null object
experience_in_dbs          818 non-null int64
experience_outside_dbs     818 non-null int64
experience_in_fin          818 non-null int64
experience_in_insurance    818 non-null int64
target_achievement         818 non-null object
overall_performance        818 non-null int64
dtypes: int64(7), object(3)
memory usage: 70.3+ KB


In [104]:
""" loading of loan Training data"""
loan = pd.read_csv('C:/vikas/data/ML/loan_training_01.csv')
loan = loan[['rm_employee_id','Loan Status']]

""" Renaming the columns name to match with RM dataset """
loan.rename(columns = {'rm_employee_id':'employee_id'},inplace = True)

""" Find the count of Good and Bad loan """
loan = loan.groupby(['employee_id','Loan Status'])['Loan Status'].count().unstack()
loan.reset_index(inplace=True)
loan.rename(columns = {0:'bad',1:'good'},inplace = True)
loan['good'].fillna(0,inplace=True)
loan['bad'].fillna(0,inplace=True)

""" Fetching the Success Rate """
loan['Success_rate'] = loan['good']/(loan['bad'] + loan['good'])
loan['Success_rate'] = loan['Success_rate'].apply(lambda x: round(x,3))
loan.head()

Loan Status,employee_id,bad,good,Success_rate
0,1,4,32,0.889
1,5,3,23,0.885
2,9,2,28,0.933
3,12,7,20,0.741
4,15,4,21,0.84


In [105]:
""" Join both loan with rm dataset """
def fix_job(string):
    if string != 0:
        return 1
    else :
        return 0

def target(string):
    if string == 'MEDIUM' :
        return 1
    elif string == 'HIGH' :
        return 2
    else :
        return 0
    
loan_rm = pd.merge(rm,loan, on='employee_id', how='left')
loan_rm['target_achievement'] = loan_rm['target_achievement'].apply(target)
loan_rm['last_job_type'] = loan_rm['last_job_type'].apply(fix_job)
loan_rm['last_job_industry'] = loan_rm['last_job_industry'].apply(fix_job)
loan_rm.head()

Unnamed: 0,employee_id,number_of_jobs,last_job_type,last_job_industry,experience_in_dbs,experience_outside_dbs,experience_in_fin,experience_in_insurance,target_achievement,overall_performance,bad,good,Success_rate
0,1,0,0,0,8,0,8,0,1,1,4,32,0.889
1,5,0,0,0,4,0,4,0,1,1,3,23,0.885
2,9,0,0,0,9,0,9,0,1,1,2,28,0.933
3,12,5,1,1,9,7,13,3,1,2,7,20,0.741
4,15,0,0,0,4,0,4,0,1,1,4,21,0.84


In [106]:
""" Creating the Training and test dataset from the final loan and rm """
X = loan_rm.drop('overall_performance',axis=1)
y = loan_rm['overall_performance']
X_train,X_test, y_train,y_test = train_test_split(X, y, test_size=0.40, random_state=101)

In [107]:
# importing th model
from sklearn.linear_model import LogisticRegression

logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [108]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# now predict the output on training dataset
predict = logmodel.predict(X_train)
print(classification_report(y_train,predict))

             precision    recall  f1-score   support

          0       0.91      0.24      0.38        42
          1       0.87      0.94      0.90       267
          2       0.93      0.97      0.95       181

avg / total       0.89      0.89      0.87       490



In [109]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
print(confusion_matrix(y_train,predict))
print("Accuracy Score: %f" % accuracy_score(y_train,predict))
print("F1 Score: %f" % f1_score(y_train,predict))

[[ 10  32   0]
 [  1 252  14]
 [  0   6 175]]
Accuracy Score: 0.891837
F1 Score: 0.874815


  sample_weight=sample_weight)


# Now predicting the Final Rating of RM on the basis of Loan given to good and bad customer

In [115]:
""" Now predicting the RM rating"""
loan_final = pd.read_csv('C:/vikas/data/ML/loan_evaluation02_results.csv')
loan_final = loan_final[['rm_employee_id','Loan Status']]
loan_final.rename(columns = {'rm_employee_id':'employee_id'},inplace = True)
loan_final = loan_final.groupby(['employee_id','Loan Status'])['Loan Status'].count().unstack()
loan_final.reset_index(inplace=True)
loan_final.rename(columns = {0:'bad',1:'good'},inplace = True)
loan_final['good'].fillna(0,inplace=True)
loan_final['bad'].fillna(0,inplace=True)
loan_final['Success_rate'] = loan_final['good']/(loan_final['bad'] + loan_final['good'])
loan_final['Success_rate'] = loan_final['Success_rate'].apply(lambda x: round(x,3))
loan_final.head()

Loan Status,employee_id,bad,good,Success_rate
0,0,1,5,0.833
1,2,0,4,1.0
2,3,0,5,1.0
3,4,0,7,1.0
4,6,0,7,1.0


In [132]:
""" Join both loan with rm """
def fix_job(string):
    if string != 0:
        return 1
    else :
        return 0

def target(string):
    if string == 'MEDIUM' :
        return 1
    elif string == 'HIGH' :
        return 2
    else :
        return 0

rm_final = pd.read_csv('C:/vikas/data/ML/rm_evaluation_02.csv')
rm_final.rename(columns = {'id':'employee_id'},inplace= True)
loan_rm_final = pd.merge(rm_final,loan_final, on='employee_id', how='left')
loan_rm_final['target_achievement'] = loan_rm_final['target_achievement'].apply(target)
loan_rm_final['last_job_type'] = loan_rm_final['last_job_type'].apply(fix_job)
loan_rm_final['last_job_industry'] = loan_rm_final['last_job_industry'].apply(fix_job)
loan_rm_final.drop('overall_rating',axis=1,inplace=True)
loan_rm_final.head()

Unnamed: 0,employee_id,number_of_jobs,last_job_type,last_job_industry,experience_in_dbs,experience_outside_dbs,experience_in_fin,experience_in_insurance,target_achievement,bad,good,Success_rate
0,1921,0,1,1,6,0,6,0,1,0,8,1
1,2839,4,1,1,10,7,12,5,2,0,5,1
2,1942,0,1,1,3,0,3,0,1,0,4,1
3,1202,0,1,1,10,0,10,0,1,0,6,1
4,3761,0,1,1,5,0,5,0,1,0,6,1


In [133]:
""" Final Prediction """
predict = logmodel.predict(loan_rm_final)
loan_rm_final['overall_rating'] = predict
loan_rm_final.head()

Unnamed: 0,employee_id,number_of_jobs,last_job_type,last_job_industry,experience_in_dbs,experience_outside_dbs,experience_in_fin,experience_in_insurance,target_achievement,bad,good,Success_rate,overall_rating
0,1921,0,1,1,6,0,6,0,1,0,8,1,1
1,2839,4,1,1,10,7,12,5,2,0,5,1,2
2,1942,0,1,1,3,0,3,0,1,0,4,1,0
3,1202,0,1,1,10,0,10,0,1,0,6,1,1
4,3761,0,1,1,5,0,5,0,1,0,6,1,0


In [137]:
loan_rm_final[['employee_id','overall_rating']].to_csv('C:/vikas/data/ML/final_output.csv', index=False)

# DONE!!!