In [39]:
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.compose import ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, OrdinalEncoder
from sklearn.tree import DecisionTreeClassifier

# Apply matplotlib's 'ggplot' style sheet to figures created after this point.
plt.style.use('ggplot')
# IPython magic: render matplotlib figures inline in the cell output.
%matplotlib inline

In [24]:
# Load the raw dataset. reset_index() materialises the previous index as an
# 'index' column; a later cell removes that column by name.
df = pd.read_csv('final_dataset.csv')
df = df.reset_index()

# Treat +/-inf as missing values. The result of replace(...) used to be
# discarded (together with the chained dropna(axis=1)), so infinities were
# never removed. Rows, not columns, are dropped so that every column the
# later cells refer to by name is still present.
df = df.replace([np.inf, -np.inf], np.nan)
df = df.dropna()
df.head()

Unnamed: 0,index,Match Runs,Total Runs,Ave,Career SR,HS,Opposition,Ground,m1,m2,m3,m4,m5,Records
0,0,19.0,66.0,14.16,73.91,36.0,England,Leeds,0.0,0.0,36.0,10.0,20.0,-10
1,1,31.0,85.0,16.57,82.26,36.0,England,Nottingham,0.0,36.0,10.0,20.0,19.0,-10
2,2,36.0,116.0,19.0,93.25,36.0,Sri Lanka,Nagpur,36.0,10.0,20.0,19.0,31.0,-10
3,3,53.0,152.0,22.77,100.49,53.0,Sri Lanka,Pune,10.0,20.0,19.0,31.0,36.0,-10
4,4,30.0,205.0,23.5,100.85,53.0,Sri Lanka,Margao,20.0,19.0,31.0,36.0,53.0,0


In [25]:
# Bucket 'Match Runs' into six ordinal classes: class 1 is anything below 20,
# classes 2-5 are the 20-run wide bands [20, 40) ... [80, 100), and class 6
# is 100 and above. Rows matching no mask (e.g. missing runs) are left unset.
match_runs = df['Match Runs']
class_masks = {
    6: match_runs >= 100,
    5: (match_runs < 100) & (match_runs >= 80),
    4: (match_runs < 80) & (match_runs >= 60),
    3: (match_runs < 60) & (match_runs >= 40),
    2: (match_runs < 40) & (match_runs >= 20),
    1: match_runs < 20,
}
for class_label, in_band in class_masks.items():
    df.loc[in_band, 'class'] = class_label

In [26]:
# Store the labels as integers. The cast used to be displayed only and never
# assigned, so 'class' stayed float64 (1.0, 2.0, ...) in the exported CSV and
# in the target vector handed to the model.
df['class'] = df['class'].astype(int)
df['class']

0        1
1        2
2        2
3        3
4        2
        ..
16856    4
16857    2
16858    1
16859    1
16860    1
Name: class, Length: 16822, dtype: int64

In [27]:
# Remove the helper 'index' column in place, then write the labelled frame to
# CSV without the pandas row index.
# NOTE(review): the file name says "8class" but this notebook assigns six
# classes -- confirm whether the name is stale before renaming anything.
del df['index']
df.to_csv(path_or_buf='multi_8class_dataset.csv', index=False)

In [28]:
# Target: the run-band label. Features: every remaining column except the
# label itself and 'Match Runs', which the label is derived from.
y = df['class']
df.drop(['Match Runs', 'class'], inplace=True, axis=1)
X = df  # alias, not a copy: X and df are the same frame
# NOTE(review): 'Unnamed: 0' is still a column of X and is forwarded to the
# model by the preprocessor's remainder='passthrough'. It looks like a row id
# from the CSV -- confirm, and consider excluding it from the features.

# A fixed seed makes the split (and therefore the reported scores)
# reproducible; stratifying keeps the class proportions equal in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    shuffle=True,
    random_state=42,
    stratify=y,
)

In [29]:
# Preview the first five rows of the frame after the label columns were dropped.
df.head(n=5)

Unnamed: 0,Total Runs,Ave,Career SR,HS,Opposition,Ground,m1,m2,m3,m4,m5,Records
0,66.0,14.16,73.91,36.0,England,Leeds,0.0,0.0,36.0,10.0,20.0,-10
1,85.0,16.57,82.26,36.0,England,Nottingham,0.0,36.0,10.0,20.0,19.0,-10
2,116.0,19.0,93.25,36.0,Sri Lanka,Nagpur,36.0,10.0,20.0,19.0,31.0,-10
3,152.0,22.77,100.49,53.0,Sri Lanka,Pune,10.0,20.0,19.0,31.0,36.0,-10
4,205.0,23.5,100.85,53.0,Sri Lanka,Margao,20.0,19.0,31.0,36.0,53.0,0


In [55]:
# Columns that are min-max scaled.
col = [
    'Total Runs', 'Ave', 'Career SR', 'HS',
    'm1', 'm2', 'm3', 'm4', 'm5',
    'Records',
]

# The two text columns are ordinal-encoded; categories not seen during fit are
# mapped to a fixed code (25 for 'Opposition', 200 for 'Ground').
opposition_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=25)
ground_encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=200)

preprocessor = ColumnTransformer(
    transformers=[
        ('mm', MinMaxScaler(), col),
        ('le1', opposition_encoder, ['Opposition']),
        ('le2', ground_encoder, ['Ground']),
    ],
    # Columns not named above are forwarded unchanged.
    remainder='passthrough',
)
df.head()

Unnamed: 0,Total Runs,Ave,Career SR,HS,Opposition,Ground,m1,m2,m3,m4,m5,Records
0,66.0,14.16,73.91,36.0,England,Leeds,0.0,0.0,36.0,10.0,20.0,-10
1,85.0,16.57,82.26,36.0,England,Nottingham,0.0,36.0,10.0,20.0,19.0,-10
2,116.0,19.0,93.25,36.0,Sri Lanka,Nagpur,36.0,10.0,20.0,19.0,31.0,-10
3,152.0,22.77,100.49,53.0,Sri Lanka,Pune,10.0,20.0,19.0,31.0,36.0,-10
4,205.0,23.5,100.85,53.0,Sri Lanka,Margao,20.0,19.0,31.0,36.0,53.0,0


In [56]:
# Gradient-boosted trees fitted on top of the preprocessing step.
# max_features='auto' was deprecated in scikit-learn 1.1 and removed in 1.3;
# for this classifier it meant sqrt(n_features), so 'sqrt' keeps the same
# behaviour on old versions and also runs on current ones.
# random_state pins the feature sub-sampling so repeated runs agree.
# NOTE(review): the recorded scores (1.0 train vs ~0.42 test) indicate heavy
# overfitting -- consider a shallower max_depth, fewer estimators, or early
# stopping via n_iter_no_change / validation_fraction.
model4 = GradientBoostingClassifier(
    n_estimators=800,
    max_depth=10,
    max_features='sqrt',
    learning_rate=0.1,
    verbose=1,
    random_state=42,
)
model = Pipeline(steps=[('scaler', preprocessor), ('model', model4)])
model.fit(X_train, y_train)
# Displayed as (test accuracy, train accuracy).
model.score(X_test, y_test), model.score(X_train, y_train)

      Iter       Train Loss   Remaining Time 
         1           1.4478            7.47m
         2           1.4024            7.63m
         3           1.3626            7.68m
         4           1.3202            7.84m
         5           1.2867            7.91m
         6           1.2540            7.87m
         7           1.2216            7.88m
         8           1.1939            8.00m
         9           1.1654            8.03m
        10           1.1322            8.08m
        20           0.8979            8.18m
        30           0.7612            8.76m
        40           0.6303            9.03m
        50           0.5286            9.09m
        60           0.4325            9.15m
        70           0.3464            9.21m
        80           0.2831            9.24m
        90           0.2358            9.19m
       100           0.1973            9.04m
       200           0.0411            7.12m
       300           0.0086            5.68m
       40

(0.42305407011289364, 1.0)

In [57]:
# Persist the fitted pipeline, wrapped in a dict under the 'model' key.
# `pickle` is imported in the notebook's import cell rather than here, so all
# dependencies are visible in one place.
# NOTE: unpickling executes arbitrary code -- only load 'pipeline.pkl' from a
# trusted source.
data_new = {'model': model}
with open('pipeline.pkl', 'wb') as file:
    pickle.dump(data_new, file)