In [None]:
import pandas as pd
import copy

In [None]:
xls = pd.ExcelFile('Cricket Data Set.xlsx')
xls

In [None]:
print(len(xls.sheet_names))
xls.sheet_names

In [None]:
# Build one dictionary key per worksheet: lower-case the sheet name, join its
# whitespace-separated words with underscores and append '_df'.
df_names = [
    f"{'_'.join(title.lower().split())}_df"
    for title in xls.sheet_names
]
print(len(df_names))
df_names

In [None]:
# Read every worksheet by position and store it under the matching generated name.
df_dict = {
    frame_key: pd.read_excel(xls, position)
    for position, frame_key in enumerate(df_names)
}
df_dict.keys()

In [None]:
for df_name in df_names:
    print(df_name)
    display(df_dict[df_name].head(5))

In [None]:
df_dict['control_sheet_df'].head(30)

In [None]:
# 'Unnamed: 0' becomes 'Shot Type'; 'Unnamed: 1' is discarded.
# The result is assigned back instead of edited in place so the cell can be
# re-run safely: a second run finds nothing to rename and, with
# errors='ignore', nothing left to drop (the in-place drop raised KeyError).
df_dict['control_sheet_df'] = (
    df_dict['control_sheet_df']
    .rename(columns={'Unnamed: 0': 'Shot Type'})
    .drop(columns='Unnamed: 1', errors='ignore')
)

In [None]:
df_dict['control_sheet_df'].head(30)

In [None]:
df_dict['control_sheet_df'].shape

In [None]:
for row_index in range(8,df_dict['control_sheet_df'].shape[0],9):
    print(row_index)

In [None]:
df_dict['control_sheet_df'].isna().sum()

In [None]:
df_dict['control_sheet_df'] = df_dict['control_sheet_df'].dropna(how='all') 
df_dict['control_sheet_df'].isna().sum()

In [None]:
df_dict['parameters_df']

In [None]:
shot_type = df_dict['control_sheet_df'].pop('Shot Type')

In [None]:
shot_type

In [None]:
shot_type.value_counts()

In [None]:
shot_type.isna().sum()

In [None]:
shot_type_temp = copy.deepcopy(shot_type)
shot_type_temp

In [None]:
# Fill every non-string entry with the most recent string label seen above it.
# The previous label is tracked explicitly instead of being looked up at
# label `index-1`: the index can have gaps (all-NaN rows were dropped from the
# control sheet earlier), in which case `index-1` is not a valid label.
# NOTE(review): `index` and `value` stay defined at module level after this
# loop; drop_incorrect_data reads a global named `value`, so keep these names
# until that function is fixed.
previous_label = None
for index, value in shot_type_temp.items():
    if isinstance(value, str):
        previous_label = value
    else:
        shot_type_temp[index] = previous_label
shot_type_temp

In [None]:
shot_type_temp.value_counts()

In [None]:
shot_type = shot_type_temp
shot_type.value_counts()

In [None]:
df_dict['control_sheet_df'].shape

In [None]:
# Keep only parameter rows that define both a ball type and its symbol, then
# map symbol -> ball type name.
ball_type_df = df_dict['parameters_df'][['Ball Type', 'ball symbol']].dropna()
ball_type_dict = {
    symbol: name
    for name, symbol in zip(ball_type_df['Ball Type'], ball_type_df['ball symbol'])
}
ball_type_dict

In [None]:
print(df_dict['control_sheet_df'].columns)
df_dict['control_sheet_df'].rename(columns = {'Sl':'SL'}, inplace = True)
df_dict['control_sheet_df'].columns

In [None]:
def generate_ballwise_df(df = df_dict['control_sheet_df']):
    """Split ``df`` into one frame per ball type and store each in ``df_dict``.

    ``df`` is consumed three columns at a time. The header of the first column
    in each group is the ball symbol; it is looked up in ``ball_type_dict`` to
    build the key ``'<ball type>_shots_df'``. Each stored frame has the columns
    'Ball Type' (the symbol), 'Shot Type' (taken from the module-level
    ``shot_type`` series, aligned on the index) and the three renamed value
    columns.

    Args:
        df: frame whose columns come in groups of three per ball type.

    Returns:
        None. Results are written to the module-level ``df_dict``.

    Raises:
        KeyError: if a group's first header is not a key of ``ball_type_dict``.
    """
    value_columns = [
        'Runs Scored',
        'Correct execution of shot according to the ball',
        'Effectiveness of the shot execution',
    ]

    for col_index in range(0, df.shape[1], 3):
        # Work on a copy: renaming and inserting columns on an iloc slice of
        # `df` triggers SettingWithCopyWarning and ties the result to `df`.
        ball_df = df.iloc[:, col_index:col_index + 3].copy()
        ball_symbol = ball_df.columns[0]
        ball_type = ball_type_dict[ball_symbol]

        ball_df.columns = value_columns
        ball_df.insert(0, 'Ball Type', ball_symbol)
        ball_df.insert(1, 'Shot Type', shot_type)

        df_dict[f'{ball_type}_shots_df'] = ball_df

In [None]:
generate_ballwise_df()

In [None]:
for df_name in df_dict.keys():
    print(df_name)
    display(df_dict[df_name].head(10))

In [None]:
df_dict['analysis_parameters_df'] = df_dict['analysis_parameters_df'].dropna(how = 'all')
df_dict['analysis_parameters_df'].reset_index(drop = True, inplace = True)

In [None]:
pd.set_option('display.max_colwidth', None)
df_dict['analysis_parameters_df']

In [None]:
def unique_values(df):
    """Return ``(column name, distinct values)`` pairs for the columns of ``df``.

    The column named 'Sno' is skipped. Distinct values are listed in order of
    first appearance, as returned by ``Series.unique``.
    """
    pairs = []
    for column in df.columns:
        if column == 'Sno':
            continue
        pairs.append((column, list(df[column].unique())))
    return pairs

In [None]:
for df_name in df_dict.keys():
    if 'vs' not in df_name:
        continue 
    print(df_name)
    for col,vals in unique_values(df_dict[df_name]):
        print(f'{col} : {vals}')
    print()

In [None]:
for df_name in df_dict.keys():
    if 'vs' in df_name:
        continue 
    print(df_name)
    for col,vals in unique_values(df_dict[df_name]):
        print(f'{col} : {vals}')
    print()

In [None]:
for df_name in df_dict.keys():
    if 'vs' not in df_name:
        continue
    print(df_name)
    print(df_dict[df_name][df_dict[df_name]['Dismissal kind'].notnull()])

In [None]:
# Recode the two free-text dismissal descriptions in this sheet to short codes.
dismissal_recoding = {'Leg Stump': 'B', 'Stump Out': 'St'}
df_dict['ind_vs_afg_df']['Dismissal kind'] = (
    df_dict['ind_vs_afg_df']['Dismissal kind'].replace(dismissal_recoding)
)

In [None]:
df_dict['ind_vs_afg_df'].loc[df_dict['ind_vs_afg_df']['Dismissal kind'].notnull() & df_dict['ind_vs_afg_df']['Wicket'].isna(),'Wicket'] = 'Wicket'

In [None]:
def clean_missing_values(df):
    """Remove or fill the missing values of ``df`` in place and return ``df``.

    Only columns that contain at least one missing value are touched:

    * 'Pitch Type' and 'Match ID' are dropped.
    * 'Wide', 'No ball', 'Wicket' and 'Hit the Bat' become float flags:
      1.0 where a value was present, 0.0 where it was missing.
    * every other column has its missing values replaced by 0.

    Args:
        df: frame to clean; it is modified.

    Returns:
        The same ``df`` object.
    """
    unary_cols = ['Pitch Type', 'Match ID']
    binary_cols = ['Wide', 'No ball', 'Wicket', 'Hit the Bat']

    null_counts = df.isna().sum()
    cols_nan = [col_name for col_name, nan_count in null_counts.items() if nan_count != 0]

    for col in cols_nan:
        if col in unary_cols:
            df.drop(columns=col, inplace=True)
        elif col in binary_cols:
            df[col] = df[col].notnull().astype('float')
        else:
            # Assign the filled column back. `df[col].fillna(..., inplace=True)`
            # acts on a temporary selection and does not reliably update `df`
            # (it is a no-op under pandas copy-on-write).
            df[col] = df[col].fillna(value=0)

    return df

In [None]:
def drop_incorrect_data(df,column_name,child_values,parent_values):
    """Drop, in place, the rows of ``df`` whose ``column_name`` value is not allowed.

    Args:
        df: frame to filter; it is modified.
        column_name: column whose values are checked.
        child_values: values observed in the column.
        parent_values: values that are allowed.

    Returns:
        None.
    """
    # The original early-exit tested an unrelated name (`value`) instead of the
    # loop variable, so it raised NameError or depended on a leftover global.
    # Computing the offending values first makes the "nothing to drop" case explicit.
    incorrect_values = [val for val in child_values if val not in parent_values]
    if not incorrect_values:
        return

    df.drop(df[df[column_name].isin(incorrect_values)].index, inplace=True)
    return

In [None]:
def validate_column_values(df):
    """Drop rows of ``df`` whose coded values are not defined on the parameters sheet.

    Columns are selected by name (case-insensitive):

    * name contains 'shot': value must be a 'shot symbol';
    * name equals 'ball type': value must be a 'ball symbol';
    * name contains 'control': -1.0 is first replaced by 0.0, then the value
      must be 0.0 or 1.0;
    * name contains 'effectiveness': value must be one of 0.0, 0.1, ..., 1.0 or -1.0;
    * name contains 'dismissal': 'C' is rewritten to 'Ct', values longer than
      three characters are translated through the 'Dismissal Kind' ->
      'dismisal symbol' mapping, then the value must be a 'dismisal symbol'.
      The value 0 is never treated as invalid.

    Symbols are compared case-insensitively. ``df`` is modified in place.

    Args:
        df: frame to validate.

    Returns:
        The same ``df`` object.

    Raises:
        KeyError: if a dismissal value longer than three characters is not a
            'Dismissal Kind' on the parameters sheet.
    """
    parameters = df_dict['parameters_df']

    def _upper(values):
        return [str(symbol).upper() for symbol in values]

    ball_symbol_list = _upper(parameters['ball symbol'].dropna())
    shot_symbol_list = _upper(parameters['shot symbol'].dropna())
    dismissal_symbol_list = _upper(parameters['dismisal symbol'].dropna())
    # NOTE(review): the two columns are de-NaN'ed independently before being
    # zipped, so this presumes names and symbols are missing on the same rows.
    dismissal_dict = dict(zip(parameters['Dismissal Kind'].dropna(), parameters['dismisal symbol'].dropna()))

    control_values_list = [0.0, 1.0]
    effectiveness_values_list = [x/10.0 for x in range(0, 11)] + [-1.0]

    def _drop_unknown_symbols(col_name, present_values, allowed_upper):
        # Decide validity case-insensitively but hand the values over exactly
        # as they appear in the column. Passing upper-cased copies meant that
        # `isin` never matched an invalid mixed-case cell, so it was kept.
        unknown = [present for present in present_values if str(present).upper() not in allowed_upper]
        drop_incorrect_data(df, col_name, unknown, allowed_upper)

    for col_name, col_values in unique_values(df):
        lowered_name = col_name.lower()

        if 'shot' in lowered_name:
            _drop_unknown_symbols(col_name, col_values, shot_symbol_list)
        elif lowered_name == 'ball type':
            _drop_unknown_symbols(col_name, col_values, ball_symbol_list)
        elif 'control' in lowered_name:
            if -1.0 in col_values:
                df[col_name] = df[col_name].replace(-1.0, 0.0)
                col_values.remove(-1.0)
            drop_incorrect_data(df, col_name, col_values, control_values_list)
        elif 'effectiveness' in lowered_name:
            drop_incorrect_data(df, col_name, col_values, effectiveness_values_list)
        elif 'dismissal' in lowered_name:
            # 0 is exempt from validation; it is only present when the column
            # contains it, so guard the removal instead of raising ValueError.
            if 0 in col_values:
                col_values.remove(0)

            if 'C' in col_values:
                df[col_name] = df[col_name].replace('C', 'Ct')
            col_values = ['Ct' if col_value == 'C' else col_value for col_value in col_values]

            # Values longer than three characters are treated as full names
            # and translated to their symbol.
            for col_value in col_values:
                if len(col_value) > 3:
                    df[col_name] = df[col_name].replace(col_value, dismissal_dict[col_value])
            col_values = [dismissal_dict[col_value] if len(col_value) > 3 else col_value for col_value in col_values]

            _drop_unknown_symbols(col_name, col_values, dismissal_symbol_list)

    return df

In [None]:
def clean_df(df):
    """Return a cleaned, validated copy of one match sheet.

    Steps: remove the 'Sno' column, drop rows that are entirely empty, reset
    the index, rename the player/bowler/effectiveness columns, then run
    ``clean_missing_values`` and ``validate_column_values``.

    The input frame is left untouched, so calling this twice on the same sheet
    works; dropping 'Sno' in place made the second call raise KeyError.

    Args:
        df: raw match sheet.

    Returns:
        The cleaned frame.
    """
    renamed_columns = {
        'Player Name': 'Batsman Name',
        'Player ID': 'Batsman Player ID',
        'Bowler': 'Bowler Name',
        'Player ID.1': 'Bowler Player ID',
        'X': 'Effectiveness',
    }

    prepared_df = (
        df.drop(columns='Sno', errors='ignore')
        .dropna(how='all')
        .reset_index(drop=True)
        .rename(columns=renamed_columns)
    )

    non_nan_df = clean_missing_values(prepared_df)
    validated_df = validate_column_values(non_nan_df)

    return validated_df

In [None]:
# Total row count over the match sheets, i.e. the df_dict entries whose key contains 'vs'.
length = sum(
    df_dict[sheet_key].shape[0]
    for sheet_key in df_dict.keys()
    if 'vs' in sheet_key
)

print(length)

In [None]:
# Clean every match sheet (keys containing 'vs') and stack the results.
# The frames are collected first and concatenated once: growing a frame with
# pd.concat inside the loop re-copies all rows on every pass and starts from
# an empty frame, which newer pandas warns about.
cleaned_frames = []

for df_name in df_dict.keys():
    if 'vs' not in df_name:
        continue
    cleaned_df = clean_df(df_dict[df_name])
    cleaned_frames.append(cleaned_df)

# pd.concat rejects an empty list, so fall back to an empty frame.
matches_df = pd.concat(cleaned_frames, ignore_index=True) if cleaned_frames else pd.DataFrame()

matches_df.shape

In [None]:
for df_name in df_dict.keys():
    if 'vs' not in df_name:
        continue
    print(df_name)
    for col_name, col_values in unique_values(df_dict[df_name]):
        print(col_name, col_values)
    print()

In [None]:
matches_df

In [None]:
for col_name, col_values in unique_values(matches_df):
    print(col_name,col_values)

In [None]:
matches_df.nunique()

In [None]:
player_info = dict(zip(matches_df['Batsman Name'],matches_df['Batsman Player ID']))
player_info

In [None]:
len(player_info)

In [None]:
# Merge the alternative spellings of two batsmen's names.
# The result is assigned back to the column: calling replace(..., inplace=True)
# on `matches_df['Batsman Name']` acts on a temporary selection and is not
# guaranteed to update matches_df (it is a no-op under pandas copy-on-write).
matches_df['Batsman Name'] = matches_df['Batsman Name'].replace(
    {'Kedar Jadhav': 'Kedhar Jadhav', 'Shikar Dhawan': 'Shikhar Dhawan'}
)

In [None]:
matches_df.nunique()

In [None]:
df_dict['analysis_parameters_df']

In [None]:
def off_side_score(df = matches_df):
    """Rank batsmen by their off-side score.

    For each batsman with at least one shot of type 'DR', 'C' or 'RS':

    * percentage = off-side shots / all balls with a recorded shot type * 100
    * average    = summed 'Effectiveness' of those shots / their count
    * score      = percentage / 100 * average

    Returns a frame with 'Batsman Name', the percentage, the average and
    'off_side_score', sorted by score in descending order.
    """
    # NOTE(review): the other score functions spell this symbol 'Dr'; confirm
    # that 'DR' matches what the 'Type of Shot' column actually contains.
    off_side_shots = ['DR','C','RS']
    pct_col = 'Highest percentage of off side shots'
    avg_col = 'Highest effectiveness average of off side shots'

    is_off_side = df['Type of Shot'].isin(off_side_shots)
    per_batsman = df.loc[is_off_side].groupby('Batsman Name').agg(
        off_side_shots_count=('Type of Shot', 'count'),
        sum_effectiveness=('Effectiveness', 'sum'),
    )
    balls_faced = df.groupby('Batsman Name').agg(
        total_no_of_balls_faced=('Type of Shot', 'count'),
    )

    scores = per_batsman.merge(balls_faced, on='Batsman Name').reset_index()
    scores[pct_col] = (scores['off_side_shots_count'] / scores['total_no_of_balls_faced']) * 100
    scores[avg_col] = scores['sum_effectiveness'] / scores['off_side_shots_count']
    scores['off_side_score'] = (scores[pct_col] / 100) * scores[avg_col]

    ranked = scores.sort_values(by=['off_side_score'], ascending=False)
    return ranked.loc[:, ['Batsman Name', pct_col, avg_col, 'off_side_score']]

In [None]:
off_side_score()

In [None]:
def leg_side_score(df = matches_df):
    """Rank batsmen by their leg-side score.

    For each batsman with at least one shot of type 'P', 'ODR', 'LG', 'F' or 'S':

    * percentage = leg-side shots / all balls with a recorded shot type * 100
    * average    = summed 'Effectiveness' of those shots / their count
    * score      = percentage / 100 * average

    Returns a frame with 'Batsman Name', the percentage, the average and
    'leg_side_score', sorted by score in descending order.
    """
    leg_side_shots = ['P','ODR','LG','F','S']
    pct_col = 'Highest percentage of leg side shots'
    avg_col = 'Highest effectiveness average of leg side shots'

    is_leg_side = df['Type of Shot'].isin(leg_side_shots)
    per_batsman = df.loc[is_leg_side].groupby('Batsman Name').agg(
        leg_side_shots_count=('Type of Shot', 'count'),
        sum_effectiveness=('Effectiveness', 'sum'),
    )
    balls_faced = df.groupby('Batsman Name').agg(
        total_no_of_balls_faced=('Type of Shot', 'count'),
    )

    scores = per_batsman.merge(balls_faced, on='Batsman Name').reset_index()
    scores[pct_col] = (scores['leg_side_shots_count'] / scores['total_no_of_balls_faced']) * 100
    scores[avg_col] = scores['sum_effectiveness'] / scores['leg_side_shots_count']
    scores['leg_side_score'] = (scores[pct_col] / 100) * scores[avg_col]

    ranked = scores.sort_values(by=['leg_side_score'], ascending=False)
    return ranked.loc[:, ['Batsman Name', pct_col, avg_col, 'leg_side_score']]

In [None]:
leg_side_score()

In [None]:
def bouncy_track_score(df = matches_df):
    """Rank batsmen by their bouncy-track score.

    For each batsman with at least one shot of type 'D', 'P', 'C' or 'L':

    * percentage = those shots / all balls with a recorded shot type * 100
    * average    = summed 'Effectiveness' of those shots / their count
    * score      = percentage / 100 * average

    Returns a frame with 'Batsman Name', the percentage, the average and
    'bouncy_track_score', sorted by score in descending order.
    """
    bouncy_track_shots = ['D','P','C','L']
    pct_col = 'Highest percentage of bouncy track shots'
    avg_col = 'Highest effectiveness average of bouncy track shots'

    is_bouncy_track = df['Type of Shot'].isin(bouncy_track_shots)
    per_batsman = df.loc[is_bouncy_track].groupby('Batsman Name').agg(
        bouncy_track_shots_count=('Type of Shot', 'count'),
        sum_effectiveness=('Effectiveness', 'sum'),
    )
    balls_faced = df.groupby('Batsman Name').agg(
        total_no_of_balls_faced=('Type of Shot', 'count'),
    )

    scores = per_batsman.merge(balls_faced, on='Batsman Name').reset_index()
    scores[pct_col] = (scores['bouncy_track_shots_count'] / scores['total_no_of_balls_faced']) * 100
    scores[avg_col] = scores['sum_effectiveness'] / scores['bouncy_track_shots_count']
    scores['bouncy_track_score'] = (scores[pct_col] / 100) * scores[avg_col]

    ranked = scores.sort_values(by=['bouncy_track_score'], ascending=False)
    return ranked.loc[:, ['Batsman Name', pct_col, avg_col, 'bouncy_track_score']]

In [None]:
bouncy_track_score()

In [None]:
def aggressive_shots_score(df = matches_df):
    """Rank batsmen by their aggressive-shot score.

    For each batsman with at least one shot whose type is in the aggressive
    list below:

    * percentage = those shots / all balls with a recorded shot type * 100
    * average    = summed 'Effectiveness' of those shots / their count
    * score      = percentage / 100 * average

    Returns a frame with 'Batsman Name', the percentage, the average and
    'aggressive_score', sorted by score in descending order.
    """
    aggressive_shots = ['Dr', 'C', 'P', 'Sw', 'RS', 'ODR', 'F', 'R', 'LG', 'SC']
    pct_col = 'Highest percentage of aggressive shots'
    avg_col = 'Highest effectiveness average of aggressive shots'

    is_aggressive = df['Type of Shot'].isin(aggressive_shots)
    per_batsman = df.loc[is_aggressive].groupby('Batsman Name').agg(
        aggressive_shots_count=('Type of Shot', 'count'),
        sum_effectiveness=('Effectiveness', 'sum'),
    )
    balls_faced = df.groupby('Batsman Name').agg(
        total_no_of_balls_faced=('Type of Shot', 'count'),
    )

    scores = per_batsman.merge(balls_faced, on='Batsman Name').reset_index()
    scores[pct_col] = (scores['aggressive_shots_count'] / scores['total_no_of_balls_faced']) * 100
    scores[avg_col] = scores['sum_effectiveness'] / scores['aggressive_shots_count']
    scores['aggressive_score'] = (scores[pct_col] / 100) * scores[avg_col]

    ranked = scores.sort_values(by=['aggressive_score'], ascending=False)
    return ranked.loc[:, ['Batsman Name', pct_col, avg_col, 'aggressive_score']]

In [None]:
aggressive_shots_score()

In [None]:
def defense_score(df = matches_df):
    """Rank batsmen by their defensive-shot score.

    For each batsman with at least one shot of type 'D' or 'L':

    * percentage = those shots / all balls with a recorded shot type * 100
    * average    = summed 'Effectiveness' of those shots / their count
    * score      = percentage / 100 * average

    Returns a frame with 'Batsman Name', the percentage, the average and
    'defense_score', sorted by score in descending order.
    """
    defense_shots = ['D','L']
    pct_col = 'Highest percentage of defense shots'
    avg_col = 'Highest effectiveness average of defense shots'

    is_defensive = df['Type of Shot'].isin(defense_shots)
    per_batsman = df.loc[is_defensive].groupby('Batsman Name').agg(
        defense_shots_count=('Type of Shot', 'count'),
        sum_effectiveness=('Effectiveness', 'sum'),
    )
    balls_faced = df.groupby('Batsman Name').agg(
        total_no_of_balls_faced=('Type of Shot', 'count'),
    )

    scores = per_batsman.merge(balls_faced, on='Batsman Name').reset_index()
    scores[pct_col] = (scores['defense_shots_count'] / scores['total_no_of_balls_faced']) * 100
    scores[avg_col] = scores['sum_effectiveness'] / scores['defense_shots_count']
    scores['defense_score'] = (scores[pct_col] / 100) * scores[avg_col]

    ranked = scores.sort_values(by=['defense_score'], ascending=False)
    return ranked.loc[:, ['Batsman Name', pct_col, avg_col, 'defense_score']]

In [None]:
defense_score()

In [None]:
def spin_ball_score(df = matches_df):
    """Rank batsmen by their score against 'GY' and 'LB' deliveries.

    Only balls whose 'Ball type' is 'GY' or 'LB' and whose 'Type of Shot' is in
    the shot list below are counted. For each batsman with at least one such ball:

    * percentage = those shots / all balls with a recorded shot type * 100
    * average    = summed 'Effectiveness' of those shots / their count
    * score      = percentage / 100 * average

    Returns a frame with 'Batsman Name', the percentage, the average and
    'spin_ball_score', sorted by score in descending order.
    """
    spin_ball_shots = ['D', 'Dr', 'C', 'P', 'Sw', 'RS', 'ODR', 'F', 'R', 'LG', 'SC']
    spin_ball = ['GY', 'LB']
    pct_col = 'Highest percentage of spin ball shots'
    avg_col = 'Highest effectiveness average of spin ball shots'

    is_listed_shot = df['Type of Shot'].isin(spin_ball_shots)
    is_spin_delivery = df['Ball type'].isin(spin_ball)
    per_batsman = df.loc[is_listed_shot & is_spin_delivery].groupby('Batsman Name').agg(
        spin_ball_shots_count=('Type of Shot', 'count'),
        sum_effectiveness=('Effectiveness', 'sum'),
    )
    balls_faced = df.groupby('Batsman Name').agg(
        total_no_of_balls_faced=('Type of Shot', 'count'),
    )

    scores = per_batsman.merge(balls_faced, on='Batsman Name').reset_index()
    scores[pct_col] = (scores['spin_ball_shots_count'] / scores['total_no_of_balls_faced']) * 100
    scores[avg_col] = scores['sum_effectiveness'] / scores['spin_ball_shots_count']
    scores['spin_ball_score'] = (scores[pct_col] / 100) * scores[avg_col]

    ranked = scores.sort_values(by=['spin_ball_score'], ascending=False)
    return ranked.loc[:, ['Batsman Name', pct_col, avg_col, 'spin_ball_score']]

In [None]:
spin_ball_score()

In [None]:
def control_score(df = matches_df):
    """Rank batsmen by their controlled-shot score.

    A ball counts as controlled when its 'Control' value equals 1.0; the shot
    type is not used as a filter. For each batsman with at least one
    controlled ball:

    * percentage = controlled balls / all balls with a recorded shot type * 100
    * average    = summed 'Effectiveness' of the controlled balls / their count
    * score      = percentage / 100 * average

    Returns a frame with 'Batsman Name', the percentage, the average and
    'control_score', sorted by score in descending order.
    """
    # The list of shot symbols that used to be defined here was never read and
    # has been removed.
    pct_col = 'Highest percentage of controlled shots'
    avg_col = 'Highest effectiveness average of controlled shots'

    is_controlled = df['Control'] == 1.0
    shot_count_df = df.loc[is_controlled].groupby('Batsman Name').agg(
        controlled_shots_count=('Type of Shot', 'count'),
        sum_effectiveness=('Effectiveness', 'sum'),
    )
    grouped_df = df.groupby('Batsman Name').agg(
        total_no_of_balls_faced=('Type of Shot', 'count'),
    )

    control_score_df = shot_count_df.merge(grouped_df, on='Batsman Name').reset_index()
    control_score_df[pct_col] = (control_score_df['controlled_shots_count'] / control_score_df['total_no_of_balls_faced']) * 100
    control_score_df[avg_col] = control_score_df['sum_effectiveness'] / control_score_df['controlled_shots_count']
    control_score_df['control_score'] = (control_score_df[pct_col] / 100) * control_score_df[avg_col]

    control_score_df = control_score_df.sort_values(by=['control_score'], ascending=False)

    return control_score_df.loc[:, ['Batsman Name', pct_col, avg_col, 'control_score']]

In [None]:
control_score()

In [None]:
matches_df.info()

In [None]:
matches_df['Control'].unique()

In [None]:
matches_df['Control'] = matches_df['Control'].astype('float')

In [None]:
matches_df.info()

In [None]:
matches_df['Control'].unique()

In [None]:
ball_type_dict = dict(zip(df_dict['parameters_df']['ball symbol'],df_dict['parameters_df']['Ball Type']))
ball_type_dict

In [None]:
shot_count_df = matches_df.groupby(['Ball type', 'Type of Shot']).agg(Count = ('Type of Shot','count'))
shot_count_df = shot_count_df.reset_index()
shot_count_df

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

for ball_type in list(matches_df['Ball type'].unique()):
    plt.figure(figsize = (15,5))
    sns.set_style('whitegrid')
    plt.title(f"Shot Type Spread vs {ball_type_dict[ball_type]} ball")
    sns.countplot(x = 'Type of Shot', data = matches_df[matches_df['Ball type'] == ball_type])

In [None]:
matches_df.columns

In [None]:
original_matches_df = copy.deepcopy(matches_df)

In [None]:
original_matches_df

In [None]:
df_dict['off_side_score_df'] = off_side_score()
df_dict['leg_side_score_df'] = leg_side_score()
df_dict['bouncy_track_score_df'] = bouncy_track_score()
df_dict['aggressive_score_df'] = aggressive_shots_score()
df_dict['defense_score_df'] = defense_score()
df_dict['spin_ball_score_df'] = spin_ball_score()
df_dict['control_score_df'] = control_score()

In [None]:
# Attach each per-batsman score to every ball in matches_df.
for df_name in df_dict.keys():
    if 'score' not in df_name:
        continue
    score_col = df_name[:-3]  # strip the trailing '_df' to get the score column name
    if score_col in matches_df.columns:
        # Already merged (cell re-run): merging again would create
        # '<score>_x' / '<score>_y' duplicates.
        continue
    # Left join: a score table only lists batsmen with at least one qualifying
    # shot, so the default inner join silently removed every ball of any
    # batsman missing from any one table.
    matches_df = pd.merge(
        matches_df,
        df_dict[df_name].loc[:, ['Batsman Name', score_col]],
        on='Batsman Name',
        how='left',
    )
    # No qualifying shots means a share of 0, hence a score of 0.
    matches_df[score_col] = matches_df[score_col].fillna(0)

In [None]:
matches_df

In [None]:
# Object-dtype columns are treated as categorical, every other column as continuous.
categorical_variables = [
    name for name in matches_df.columns
    if not (matches_df[name].dtype != 'object')
]
continuous_variables = [
    name for name in matches_df.columns
    if matches_df[name].dtype != 'object'
]

print(categorical_variables)
print(continuous_variables)

In [None]:
from scipy.stats import chi2_contingency

alpha = 0.05

# Chi-square test of independence between each categorical column and
# 'Effectiveness'. Columns whose p-value exceeds alpha are removed from
# matches_df; columns with an empty contingency table are skipped silently.
for col in categorical_variables:
    contingency_table = pd.crosstab(matches_df[col], matches_df['Effectiveness'])
    if not contingency_table.empty:
        chi2, p_value, _, _ = chi2_contingency(contingency_table)
        print(f"Column {col}: chi2={chi2}, p_value={p_value}")
        if p_value > alpha:
            print(f"Column {col} is not significantly related to the target and will be removed.")
            matches_df.drop(col, axis=1, inplace=True)
        print()

matches_df

In [None]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
# Integer-encode the categorical columns still present in matches_df.
# The single encoder is refitted for every column, so after the loop `le`
# only holds the classes of the last column it encoded.
for col in [name for name in matches_df.columns if name in categorical_variables]:
    matches_df[col] = le.fit_transform(matches_df[col].astype(str))

In [None]:
matches_df

In [None]:
for col_name, col_values in unique_values(matches_df):
    print(col_name,col_values)

In [None]:
corr = matches_df.corr() 

plt.figure(figsize=(10, 10), dpi = 600) 
 
sns.heatmap(corr, cmap="Blues", annot=True, annot_kws={"size": 8})

plt.tick_params(axis = 'x', labelsize = 12) 
plt.tick_params(axis = 'y', labelsize = 12) 

In [None]:
X = matches_df.drop("Effectiveness", axis=1)
y = matches_df['Effectiveness']

In [None]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_X = scaler.fit_transform(X)

In [None]:
from sklearn.model_selection import train_test_split

# Split the unscaled features, then fit the scaler on the training rows only.
# Splitting an array scaled on the full data set lets the test rows influence
# the mean/variance used to scale the training rows (data leakage).
# The same random_state and row count give the same row assignment as before.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

# Re-fit the existing `scaler` so it describes the training data from here on.
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor 
import catboost as cb
from sklearn import svm
from sklearn.metrics import mean_squared_error
import numpy as np

# Stochastic models are seeded so the comparison is reproducible across runs.
reg1 = svm.SVR(kernel = 'rbf')
reg2 = LinearRegression()
reg3 = RandomForestRegressor(random_state = 42)
reg4 = XGBRegressor(random_state = 42)
# NOTE(review): reg5 duplicates reg2; it is kept so the reg*/score* names stay
# available to any later cell.
reg5 = LinearRegression()
reg6 = GradientBoostingRegressor(random_state = 42)
reg7 = DecisionTreeRegressor(random_state = 42)
# verbose = 0 stops CatBoost from printing one line per boosting iteration for
# each of the five folds.
reg8 = cb.CatBoostRegressor(verbose = 0, random_seed = 42)

labelled_models = [
    ('SVM', reg1),
    ('Linear Regression', reg2),
    ('Random Forest', reg3),
    ('XGBoost', reg4),
    ('Linear Regression', reg5),
    ('GradientBoosting Regressor', reg6),
    ('Decision Tree Regressor', reg7),
    ('Cat Boost Regressor', reg8),
]

# 5-fold cross-validation on the training split; scikit-learn reports the MAE
# negated, hence the sign flip when printing.
cv_scores = [
    cross_val_score(model, X_train, y_train, cv = 5, scoring = 'neg_mean_absolute_error')
    for _, model in labelled_models
]
score1, score2, score3, score4, score5, score6, score7, score8 = cv_scores

for (label, _), fold_scores in zip(labelled_models, cv_scores):
    print(f'{label} mean absolute error: ', np.mean(-1*fold_scores))

Learning rate set to 0.041492
0:	learn: 0.3644924	total: 1.18ms	remaining: 1.18s
1:	learn: 0.3562463	total: 2.15ms	remaining: 1.07s
2:	learn: 0.3481300	total: 3.08ms	remaining: 1.02s
3:	learn: 0.3400650	total: 3.81ms	remaining: 950ms
4:	learn: 0.3344886	total: 4.47ms	remaining: 890ms
5:	learn: 0.3274860	total: 5.32ms	remaining: 882ms
6:	learn: 0.3208305	total: 6.24ms	remaining: 885ms
7:	learn: 0.3145653	total: 7.1ms	remaining: 880ms
8:	learn: 0.3087422	total: 7.94ms	remaining: 874ms
9:	learn: 0.3038810	total: 8.78ms	remaining: 869ms
10:	learn: 0.2986821	total: 9.6ms	remaining: 863ms
11:	learn: 0.2932185	total: 10.4ms	remaining: 860ms
12:	learn: 0.2880820	total: 11.3ms	remaining: 857ms
13:	learn: 0.2834210	total: 12.2ms	remaining: 862ms
14:	learn: 0.2792252	total: 13.1ms	remaining: 859ms
15:	learn: 0.2752747	total: 14ms	remaining: 860ms
16:	learn: 0.2719233	total: 14.8ms	remaining: 856ms
17:	learn: 0.2679534	total: 15.7ms	remaining: 856ms
18:	learn: 0.2649640	total: 16.6ms	remaining: 85

191:	learn: 0.1212670	total: 162ms	remaining: 681ms
192:	learn: 0.1202501	total: 163ms	remaining: 681ms
193:	learn: 0.1200714	total: 164ms	remaining: 681ms
194:	learn: 0.1198589	total: 165ms	remaining: 679ms
195:	learn: 0.1195920	total: 165ms	remaining: 679ms
196:	learn: 0.1191968	total: 166ms	remaining: 678ms
197:	learn: 0.1190701	total: 167ms	remaining: 677ms
198:	learn: 0.1188800	total: 168ms	remaining: 676ms
199:	learn: 0.1182769	total: 169ms	remaining: 676ms
200:	learn: 0.1181574	total: 170ms	remaining: 675ms
201:	learn: 0.1180137	total: 171ms	remaining: 674ms
202:	learn: 0.1174778	total: 171ms	remaining: 673ms
203:	learn: 0.1173442	total: 172ms	remaining: 672ms
204:	learn: 0.1166366	total: 173ms	remaining: 671ms
205:	learn: 0.1166263	total: 174ms	remaining: 669ms
206:	learn: 0.1163631	total: 175ms	remaining: 669ms
207:	learn: 0.1153061	total: 175ms	remaining: 668ms
208:	learn: 0.1151863	total: 176ms	remaining: 667ms
209:	learn: 0.1141318	total: 177ms	remaining: 666ms
210:	learn: 

386:	learn: 0.0651001	total: 321ms	remaining: 509ms
387:	learn: 0.0648955	total: 322ms	remaining: 508ms
388:	learn: 0.0648726	total: 323ms	remaining: 507ms
389:	learn: 0.0647550	total: 324ms	remaining: 507ms
390:	learn: 0.0645175	total: 325ms	remaining: 506ms
391:	learn: 0.0644560	total: 326ms	remaining: 505ms
392:	learn: 0.0642954	total: 327ms	remaining: 505ms
393:	learn: 0.0641662	total: 328ms	remaining: 504ms
394:	learn: 0.0637947	total: 328ms	remaining: 503ms
395:	learn: 0.0637622	total: 329ms	remaining: 502ms
396:	learn: 0.0634350	total: 330ms	remaining: 501ms
397:	learn: 0.0634127	total: 331ms	remaining: 500ms
398:	learn: 0.0633139	total: 332ms	remaining: 500ms
399:	learn: 0.0632393	total: 332ms	remaining: 499ms
400:	learn: 0.0630509	total: 333ms	remaining: 498ms
401:	learn: 0.0630118	total: 334ms	remaining: 497ms
402:	learn: 0.0629029	total: 335ms	remaining: 496ms
403:	learn: 0.0627220	total: 336ms	remaining: 496ms
404:	learn: 0.0626861	total: 337ms	remaining: 495ms
405:	learn: 

570:	learn: 0.0484182	total: 473ms	remaining: 355ms
571:	learn: 0.0483562	total: 474ms	remaining: 355ms
572:	learn: 0.0482832	total: 475ms	remaining: 354ms
573:	learn: 0.0482048	total: 476ms	remaining: 353ms
574:	learn: 0.0481274	total: 477ms	remaining: 352ms
575:	learn: 0.0480407	total: 478ms	remaining: 352ms
576:	learn: 0.0479695	total: 479ms	remaining: 351ms
577:	learn: 0.0479108	total: 479ms	remaining: 350ms
578:	learn: 0.0478448	total: 480ms	remaining: 349ms
579:	learn: 0.0477430	total: 481ms	remaining: 348ms
580:	learn: 0.0476695	total: 482ms	remaining: 348ms
581:	learn: 0.0476462	total: 483ms	remaining: 347ms
582:	learn: 0.0476173	total: 484ms	remaining: 346ms
583:	learn: 0.0475187	total: 485ms	remaining: 345ms
584:	learn: 0.0474292	total: 485ms	remaining: 344ms
585:	learn: 0.0473576	total: 486ms	remaining: 344ms
586:	learn: 0.0473471	total: 487ms	remaining: 343ms
587:	learn: 0.0471465	total: 488ms	remaining: 342ms
588:	learn: 0.0470348	total: 489ms	remaining: 341ms
589:	learn: 

769:	learn: 0.0370240	total: 634ms	remaining: 189ms
770:	learn: 0.0370086	total: 634ms	remaining: 188ms
771:	learn: 0.0369201	total: 635ms	remaining: 188ms
772:	learn: 0.0369178	total: 636ms	remaining: 187ms
773:	learn: 0.0368399	total: 637ms	remaining: 186ms
774:	learn: 0.0367502	total: 638ms	remaining: 185ms
775:	learn: 0.0367139	total: 639ms	remaining: 184ms
776:	learn: 0.0367064	total: 640ms	remaining: 184ms
777:	learn: 0.0367043	total: 641ms	remaining: 183ms
778:	learn: 0.0366486	total: 641ms	remaining: 182ms
779:	learn: 0.0365520	total: 642ms	remaining: 181ms
780:	learn: 0.0364822	total: 643ms	remaining: 180ms
781:	learn: 0.0364254	total: 644ms	remaining: 180ms
782:	learn: 0.0364231	total: 645ms	remaining: 179ms
783:	learn: 0.0363881	total: 646ms	remaining: 178ms
784:	learn: 0.0363860	total: 646ms	remaining: 177ms
785:	learn: 0.0362944	total: 647ms	remaining: 176ms
786:	learn: 0.0362746	total: 648ms	remaining: 175ms
787:	learn: 0.0362579	total: 649ms	remaining: 175ms
788:	learn: 

972:	learn: 0.0274937	total: 799ms	remaining: 22.2ms
973:	learn: 0.0274292	total: 800ms	remaining: 21.4ms
974:	learn: 0.0273667	total: 801ms	remaining: 20.5ms
975:	learn: 0.0273537	total: 802ms	remaining: 19.7ms
976:	learn: 0.0273348	total: 802ms	remaining: 18.9ms
977:	learn: 0.0273179	total: 803ms	remaining: 18.1ms
978:	learn: 0.0272899	total: 804ms	remaining: 17.2ms
979:	learn: 0.0272737	total: 805ms	remaining: 16.4ms
980:	learn: 0.0272342	total: 806ms	remaining: 15.6ms
981:	learn: 0.0272133	total: 807ms	remaining: 14.8ms
982:	learn: 0.0271754	total: 807ms	remaining: 14ms
983:	learn: 0.0271435	total: 808ms	remaining: 13.1ms
984:	learn: 0.0271282	total: 809ms	remaining: 12.3ms
985:	learn: 0.0271141	total: 810ms	remaining: 11.5ms
986:	learn: 0.0270594	total: 811ms	remaining: 10.7ms
987:	learn: 0.0270398	total: 812ms	remaining: 9.86ms
988:	learn: 0.0270212	total: 812ms	remaining: 9.04ms
989:	learn: 0.0269791	total: 813ms	remaining: 8.21ms
990:	learn: 0.0269384	total: 814ms	remaining: 7.

294:	learn: 0.0654338	total: 247ms	remaining: 591ms
295:	learn: 0.0653284	total: 248ms	remaining: 590ms
296:	learn: 0.0652281	total: 249ms	remaining: 589ms
297:	learn: 0.0650503	total: 250ms	remaining: 589ms
298:	learn: 0.0649047	total: 251ms	remaining: 588ms
299:	learn: 0.0647611	total: 251ms	remaining: 587ms
300:	learn: 0.0645065	total: 252ms	remaining: 586ms
301:	learn: 0.0644123	total: 253ms	remaining: 585ms
302:	learn: 0.0643214	total: 254ms	remaining: 584ms
303:	learn: 0.0641800	total: 255ms	remaining: 583ms
304:	learn: 0.0640977	total: 255ms	remaining: 582ms
305:	learn: 0.0639471	total: 256ms	remaining: 581ms
306:	learn: 0.0634632	total: 257ms	remaining: 580ms
307:	learn: 0.0633258	total: 258ms	remaining: 579ms
308:	learn: 0.0631186	total: 259ms	remaining: 579ms
309:	learn: 0.0629661	total: 260ms	remaining: 578ms
310:	learn: 0.0627773	total: 260ms	remaining: 577ms
311:	learn: 0.0626464	total: 261ms	remaining: 576ms
312:	learn: 0.0625247	total: 262ms	remaining: 575ms
313:	learn: 

501:	learn: 0.0407626	total: 412ms	remaining: 409ms
502:	learn: 0.0406868	total: 413ms	remaining: 408ms
503:	learn: 0.0406040	total: 414ms	remaining: 407ms
504:	learn: 0.0404819	total: 415ms	remaining: 406ms
505:	learn: 0.0403678	total: 415ms	remaining: 406ms
506:	learn: 0.0402813	total: 416ms	remaining: 405ms
507:	learn: 0.0402001	total: 417ms	remaining: 404ms
508:	learn: 0.0401151	total: 418ms	remaining: 403ms
509:	learn: 0.0400764	total: 419ms	remaining: 402ms
510:	learn: 0.0400192	total: 420ms	remaining: 401ms
511:	learn: 0.0399804	total: 420ms	remaining: 401ms
512:	learn: 0.0399207	total: 421ms	remaining: 400ms
513:	learn: 0.0398685	total: 422ms	remaining: 399ms
514:	learn: 0.0397919	total: 423ms	remaining: 398ms
515:	learn: 0.0397214	total: 424ms	remaining: 397ms
516:	learn: 0.0396647	total: 425ms	remaining: 397ms
517:	learn: 0.0395855	total: 426ms	remaining: 396ms
518:	learn: 0.0394924	total: 426ms	remaining: 395ms
519:	learn: 0.0393831	total: 427ms	remaining: 394ms
520:	learn: 

700:	learn: 0.0293744	total: 571ms	remaining: 244ms
701:	learn: 0.0293406	total: 572ms	remaining: 243ms
702:	learn: 0.0293114	total: 573ms	remaining: 242ms
703:	learn: 0.0292536	total: 574ms	remaining: 241ms
704:	learn: 0.0292133	total: 575ms	remaining: 240ms
705:	learn: 0.0291422	total: 575ms	remaining: 240ms
706:	learn: 0.0290760	total: 576ms	remaining: 239ms
707:	learn: 0.0290501	total: 577ms	remaining: 238ms
708:	learn: 0.0289967	total: 578ms	remaining: 237ms
709:	learn: 0.0289190	total: 579ms	remaining: 237ms
710:	learn: 0.0288760	total: 580ms	remaining: 236ms
711:	learn: 0.0288434	total: 581ms	remaining: 235ms
712:	learn: 0.0287967	total: 582ms	remaining: 234ms
713:	learn: 0.0287290	total: 583ms	remaining: 233ms
714:	learn: 0.0286851	total: 590ms	remaining: 235ms
715:	learn: 0.0286477	total: 592ms	remaining: 235ms
716:	learn: 0.0286190	total: 593ms	remaining: 234ms
717:	learn: 0.0285834	total: 594ms	remaining: 233ms
718:	learn: 0.0285519	total: 595ms	remaining: 233ms
719:	learn: 

878:	learn: 0.0227739	total: 723ms	remaining: 99.5ms
879:	learn: 0.0227220	total: 724ms	remaining: 98.8ms
880:	learn: 0.0227000	total: 725ms	remaining: 98ms
881:	learn: 0.0226548	total: 726ms	remaining: 97.1ms
882:	learn: 0.0226142	total: 727ms	remaining: 96.3ms
883:	learn: 0.0226021	total: 728ms	remaining: 95.5ms
884:	learn: 0.0225797	total: 729ms	remaining: 94.7ms
885:	learn: 0.0225502	total: 729ms	remaining: 93.8ms
886:	learn: 0.0225135	total: 730ms	remaining: 93ms
887:	learn: 0.0224851	total: 731ms	remaining: 92.2ms
888:	learn: 0.0224433	total: 732ms	remaining: 91.4ms
889:	learn: 0.0224223	total: 733ms	remaining: 90.6ms
890:	learn: 0.0224006	total: 734ms	remaining: 89.8ms
891:	learn: 0.0223710	total: 735ms	remaining: 89ms
892:	learn: 0.0223208	total: 735ms	remaining: 88.1ms
893:	learn: 0.0222969	total: 736ms	remaining: 87.3ms
894:	learn: 0.0222661	total: 737ms	remaining: 86.5ms
895:	learn: 0.0222467	total: 738ms	remaining: 85.7ms
896:	learn: 0.0222112	total: 739ms	remaining: 84.8ms

168:	learn: 0.1224870	total: 161ms	remaining: 793ms
169:	learn: 0.1222980	total: 162ms	remaining: 792ms
170:	learn: 0.1222812	total: 163ms	remaining: 790ms
171:	learn: 0.1221291	total: 164ms	remaining: 789ms
172:	learn: 0.1216842	total: 165ms	remaining: 788ms
173:	learn: 0.1215566	total: 166ms	remaining: 786ms
174:	learn: 0.1214179	total: 167ms	remaining: 785ms
175:	learn: 0.1213049	total: 168ms	remaining: 784ms
176:	learn: 0.1208728	total: 168ms	remaining: 783ms
177:	learn: 0.1207120	total: 169ms	remaining: 782ms
178:	learn: 0.1200503	total: 170ms	remaining: 781ms
179:	learn: 0.1184387	total: 171ms	remaining: 780ms
180:	learn: 0.1183535	total: 172ms	remaining: 779ms
181:	learn: 0.1180668	total: 173ms	remaining: 778ms
182:	learn: 0.1179821	total: 174ms	remaining: 778ms
183:	learn: 0.1178584	total: 175ms	remaining: 777ms
184:	learn: 0.1168339	total: 176ms	remaining: 777ms
185:	learn: 0.1155921	total: 177ms	remaining: 776ms
186:	learn: 0.1153033	total: 178ms	remaining: 775ms
187:	learn: 

344:	learn: 0.0717894	total: 327ms	remaining: 621ms
345:	learn: 0.0717138	total: 328ms	remaining: 620ms
346:	learn: 0.0715987	total: 329ms	remaining: 619ms
347:	learn: 0.0714544	total: 330ms	remaining: 618ms
348:	learn: 0.0712211	total: 331ms	remaining: 617ms
349:	learn: 0.0711020	total: 331ms	remaining: 616ms
350:	learn: 0.0709897	total: 332ms	remaining: 615ms
351:	learn: 0.0703170	total: 333ms	remaining: 613ms
352:	learn: 0.0702337	total: 334ms	remaining: 612ms
353:	learn: 0.0701660	total: 335ms	remaining: 611ms
354:	learn: 0.0700961	total: 336ms	remaining: 610ms
355:	learn: 0.0700221	total: 336ms	remaining: 609ms
356:	learn: 0.0699630	total: 337ms	remaining: 608ms
357:	learn: 0.0698637	total: 338ms	remaining: 607ms
358:	learn: 0.0696099	total: 339ms	remaining: 605ms
359:	learn: 0.0694804	total: 340ms	remaining: 604ms
360:	learn: 0.0693034	total: 341ms	remaining: 603ms
361:	learn: 0.0691795	total: 342ms	remaining: 602ms
362:	learn: 0.0687900	total: 343ms	remaining: 601ms
363:	learn: 

536:	learn: 0.0465571	total: 488ms	remaining: 421ms
537:	learn: 0.0465477	total: 489ms	remaining: 420ms
538:	learn: 0.0464158	total: 490ms	remaining: 419ms
539:	learn: 0.0462569	total: 491ms	remaining: 418ms
540:	learn: 0.0461170	total: 492ms	remaining: 417ms
541:	learn: 0.0460009	total: 493ms	remaining: 417ms
542:	learn: 0.0459445	total: 494ms	remaining: 416ms
543:	learn: 0.0458498	total: 495ms	remaining: 415ms
544:	learn: 0.0458056	total: 496ms	remaining: 414ms
545:	learn: 0.0456844	total: 496ms	remaining: 413ms
546:	learn: 0.0456363	total: 497ms	remaining: 412ms
547:	learn: 0.0455446	total: 499ms	remaining: 411ms
548:	learn: 0.0455091	total: 500ms	remaining: 410ms
549:	learn: 0.0453779	total: 500ms	remaining: 409ms
550:	learn: 0.0453135	total: 501ms	remaining: 408ms
551:	learn: 0.0452373	total: 502ms	remaining: 407ms
552:	learn: 0.0451896	total: 503ms	remaining: 407ms
553:	learn: 0.0451336	total: 504ms	remaining: 406ms
554:	learn: 0.0450963	total: 505ms	remaining: 405ms
555:	learn: 

738:	learn: 0.0339508	total: 651ms	remaining: 230ms
739:	learn: 0.0339051	total: 652ms	remaining: 229ms
740:	learn: 0.0337977	total: 653ms	remaining: 228ms
741:	learn: 0.0337262	total: 654ms	remaining: 227ms
742:	learn: 0.0336437	total: 655ms	remaining: 227ms
743:	learn: 0.0336181	total: 656ms	remaining: 226ms
744:	learn: 0.0335631	total: 657ms	remaining: 225ms
745:	learn: 0.0334956	total: 658ms	remaining: 224ms
746:	learn: 0.0334728	total: 659ms	remaining: 223ms
747:	learn: 0.0334231	total: 660ms	remaining: 222ms
748:	learn: 0.0333680	total: 660ms	remaining: 221ms
749:	learn: 0.0333183	total: 661ms	remaining: 220ms
750:	learn: 0.0332963	total: 662ms	remaining: 220ms
751:	learn: 0.0332752	total: 663ms	remaining: 219ms
752:	learn: 0.0332539	total: 664ms	remaining: 218ms
753:	learn: 0.0332047	total: 665ms	remaining: 217ms
754:	learn: 0.0331600	total: 665ms	remaining: 216ms
755:	learn: 0.0331242	total: 666ms	remaining: 215ms
756:	learn: 0.0330823	total: 667ms	remaining: 214ms
757:	learn: 

934:	learn: 0.0257680	total: 812ms	remaining: 56.4ms
935:	learn: 0.0257509	total: 813ms	remaining: 55.6ms
936:	learn: 0.0256658	total: 814ms	remaining: 54.7ms
937:	learn: 0.0255825	total: 815ms	remaining: 53.8ms
938:	learn: 0.0255025	total: 815ms	remaining: 53ms
939:	learn: 0.0254819	total: 816ms	remaining: 52.1ms
940:	learn: 0.0254251	total: 817ms	remaining: 51.2ms
941:	learn: 0.0253984	total: 818ms	remaining: 50.4ms
942:	learn: 0.0253600	total: 819ms	remaining: 49.5ms
943:	learn: 0.0253381	total: 820ms	remaining: 48.6ms
944:	learn: 0.0253212	total: 821ms	remaining: 47.8ms
945:	learn: 0.0252904	total: 821ms	remaining: 46.9ms
946:	learn: 0.0252695	total: 822ms	remaining: 46ms
947:	learn: 0.0252559	total: 823ms	remaining: 45.1ms
948:	learn: 0.0252279	total: 824ms	remaining: 44.3ms
949:	learn: 0.0252054	total: 825ms	remaining: 43.4ms
950:	learn: 0.0251818	total: 826ms	remaining: 42.5ms
951:	learn: 0.0251744	total: 826ms	remaining: 41.7ms
952:	learn: 0.0250967	total: 827ms	remaining: 40.8

219:	learn: 0.1016542	total: 204ms	remaining: 723ms
220:	learn: 0.1012888	total: 205ms	remaining: 723ms
221:	learn: 0.1011228	total: 206ms	remaining: 722ms
222:	learn: 0.1005864	total: 207ms	remaining: 721ms
223:	learn: 0.1003245	total: 208ms	remaining: 720ms
224:	learn: 0.1002046	total: 209ms	remaining: 719ms
225:	learn: 0.0999594	total: 209ms	remaining: 717ms
226:	learn: 0.0994887	total: 210ms	remaining: 716ms
227:	learn: 0.0993179	total: 211ms	remaining: 715ms
228:	learn: 0.0993116	total: 212ms	remaining: 713ms
229:	learn: 0.0990281	total: 213ms	remaining: 712ms
230:	learn: 0.0985471	total: 213ms	remaining: 710ms
231:	learn: 0.0984276	total: 214ms	remaining: 709ms
232:	learn: 0.0982127	total: 215ms	remaining: 708ms
233:	learn: 0.0980509	total: 216ms	remaining: 707ms
234:	learn: 0.0979178	total: 217ms	remaining: 706ms
235:	learn: 0.0973869	total: 218ms	remaining: 705ms
236:	learn: 0.0971586	total: 219ms	remaining: 704ms
237:	learn: 0.0970204	total: 220ms	remaining: 703ms
238:	learn: 

398:	learn: 0.0618108	total: 361ms	remaining: 543ms
399:	learn: 0.0616744	total: 362ms	remaining: 543ms
400:	learn: 0.0615772	total: 363ms	remaining: 542ms
401:	learn: 0.0614963	total: 363ms	remaining: 541ms
402:	learn: 0.0613988	total: 365ms	remaining: 540ms
403:	learn: 0.0612676	total: 365ms	remaining: 539ms
404:	learn: 0.0611924	total: 366ms	remaining: 538ms
405:	learn: 0.0610919	total: 367ms	remaining: 537ms
406:	learn: 0.0610238	total: 368ms	remaining: 536ms
407:	learn: 0.0608666	total: 369ms	remaining: 535ms
408:	learn: 0.0607348	total: 370ms	remaining: 534ms
409:	learn: 0.0605029	total: 371ms	remaining: 533ms
410:	learn: 0.0604046	total: 371ms	remaining: 532ms
411:	learn: 0.0601301	total: 372ms	remaining: 531ms
412:	learn: 0.0598923	total: 373ms	remaining: 530ms
413:	learn: 0.0598201	total: 374ms	remaining: 530ms
414:	learn: 0.0597406	total: 375ms	remaining: 529ms
415:	learn: 0.0596630	total: 376ms	remaining: 528ms
416:	learn: 0.0595875	total: 377ms	remaining: 527ms
417:	learn: 

598:	learn: 0.0417350	total: 524ms	remaining: 351ms
599:	learn: 0.0416609	total: 525ms	remaining: 350ms
600:	learn: 0.0416013	total: 526ms	remaining: 349ms
601:	learn: 0.0415477	total: 527ms	remaining: 348ms
602:	learn: 0.0413957	total: 527ms	remaining: 347ms
603:	learn: 0.0413390	total: 528ms	remaining: 346ms
604:	learn: 0.0412442	total: 529ms	remaining: 345ms
605:	learn: 0.0411817	total: 530ms	remaining: 345ms
606:	learn: 0.0410990	total: 531ms	remaining: 344ms
607:	learn: 0.0410031	total: 532ms	remaining: 343ms
608:	learn: 0.0409648	total: 532ms	remaining: 342ms
609:	learn: 0.0409097	total: 533ms	remaining: 341ms
610:	learn: 0.0407933	total: 534ms	remaining: 340ms
611:	learn: 0.0407633	total: 535ms	remaining: 339ms
612:	learn: 0.0406317	total: 536ms	remaining: 338ms
613:	learn: 0.0405740	total: 537ms	remaining: 337ms
614:	learn: 0.0404556	total: 538ms	remaining: 337ms
615:	learn: 0.0403482	total: 539ms	remaining: 336ms
616:	learn: 0.0402640	total: 539ms	remaining: 335ms
617:	learn: 

804:	learn: 0.0296170	total: 689ms	remaining: 167ms
805:	learn: 0.0295897	total: 690ms	remaining: 166ms
806:	learn: 0.0295289	total: 691ms	remaining: 165ms
807:	learn: 0.0294722	total: 692ms	remaining: 164ms
808:	learn: 0.0293848	total: 693ms	remaining: 164ms
809:	learn: 0.0293450	total: 694ms	remaining: 163ms
810:	learn: 0.0293196	total: 694ms	remaining: 162ms
811:	learn: 0.0292967	total: 695ms	remaining: 161ms
812:	learn: 0.0292614	total: 696ms	remaining: 160ms
813:	learn: 0.0292205	total: 697ms	remaining: 159ms
814:	learn: 0.0291444	total: 698ms	remaining: 158ms
815:	learn: 0.0291207	total: 699ms	remaining: 158ms
816:	learn: 0.0291037	total: 699ms	remaining: 157ms
817:	learn: 0.0290751	total: 700ms	remaining: 156ms
818:	learn: 0.0290446	total: 701ms	remaining: 155ms
819:	learn: 0.0290297	total: 702ms	remaining: 154ms
820:	learn: 0.0289521	total: 703ms	remaining: 153ms
821:	learn: 0.0289188	total: 704ms	remaining: 152ms
822:	learn: 0.0288777	total: 705ms	remaining: 152ms
823:	learn: 

SVM mean absolute error:  0.21556857131932508
Linear Regression mean absolute error:  31574708513.996727
Random Forest mean absolute error:  0.04142647058823548
XGBoost mean absolute error:  0.04478300510618069
Linear Regression mean absolute error:  31574708513.996727
GradientBoosting Regressor mean absolute error:  0.07065985749796871
Decision Tree Regressor mean absolute error:  0.029264705882353144
Cat Boost Regressor mean absolute error:  0.04878898138180454


In [None]:
def fit_and_evaluate(reg, param) -> list:
    """Configure ``reg`` with ``param``, fit it, and score it on the test split.

    The notebook-level ``X_train``/``y_train`` are used for fitting and
    ``X_test``/``y_test`` for scoring.

    Parameters
    ----------
    reg : estimator
        Object exposing ``set_params``, ``fit`` and ``predict``. It is
        re-configured and re-fitted in place.
    param : dict
        Hyper-parameter names mapped to values, forwarded to ``reg.set_params``.

    Returns
    -------
    list
        ``[estimator class name, R2, MAE, MSE, RMSE, param.keys(), param.values()]``.
    """
    reg.set_params(**param)
    reg.fit(X_train, y_train)

    y_predict = reg.predict(X_test)
    r2 = r2_score(y_test, y_predict)
    mae = mean_absolute_error(y_test, y_predict)
    mse = mean_squared_error(y_test, y_predict)
    # RMSE is taken as the square root of the MSE above rather than through
    # the `squared=False` keyword, which newer scikit-learn releases no longer
    # accept. NOTE(review): identical for a single-output target; for a
    # multi-output target the per-output averaging would differ - confirm
    # y is 1-D.
    rmse = mse ** 0.5

    return [reg.__class__.__name__, r2, mae, mse, rmse, param.keys(), param.values()]

In [None]:
from sklearn.metrics import (mean_absolute_error, mean_squared_error, r2_score)
import random 
import sys

RANDOM_SEED = 12345
random.seed(RANDOM_SEED)

# One fit_and_evaluate() row per hyper-parameter combination, for both sweeps.
list_grid = []

# --- Sweep 1: random forest over tree depth x number of trees ---------------
reg = RandomForestRegressor(n_jobs=-3, random_state=RANDOM_SEED)
print(reg.__class__.__name__)
rf_depths = np.arange(3, 16, 1)
rf_tree_counts = np.arange(5, 20, 1)
for max_depth in rf_depths:
    # "\r" rewrites the same output line to act as a progress indicator.
    sys.stdout.write(f"\rmax_depth: {max_depth}/15...")
    for n_estimators in rf_tree_counts:
        param = dict(max_depth=max_depth, n_estimators=n_estimators)
        list_grid.append(fit_and_evaluate(reg, param))

# --- Sweep 2: XGBoost over depth, trees, learning rate and sampling ---------
reg = XGBRegressor(n_jobs=-3, random_state=RANDOM_SEED, verbosity=0)
print("\n" + reg.__class__.__name__)
xgb_depths = np.arange(3, 11, 1)
xgb_tree_counts = [10, 20, 50, 100]
xgb_learning_rates = [0.1, 0.2, 0.3]
# Shared by `subsample` and `colsample_bytree`; both sweep the same values.
sampling_fractions = np.arange(0.4, 1.01, 0.1)
for max_depth in xgb_depths:
    sys.stdout.write(f"\rmax_depth: {max_depth}/10...")
    for n_estimators in xgb_tree_counts:
        for learning_rate in xgb_learning_rates:
            for subsample in sampling_fractions:
                for colsample_bytree in sampling_fractions:
                    param = dict(
                        max_depth=max_depth,
                        n_estimators=n_estimators,
                        learning_rate=learning_rate,
                        subsample=subsample,
                        colsample_bytree=colsample_bytree,
                    )
                    list_grid.append(fit_and_evaluate(reg, param))

In [None]:
# Tabulate every grid-search row and rank the table by R2, best first.
grid_columns = ["reg", "R2", "MAE", "MSE", "RMSE", "param.keys", "param.values"]
reg_df = pd.DataFrame(list_grid, columns=grid_columns).sort_values(by=["R2"], ascending=False)

In [None]:
# Ten best-ranked random forest rows (reg_df is already sorted by R2).
reg_df.loc[reg_df["reg"].eq("RandomForestRegressor")].head(10)

In [None]:
# Ten best-ranked XGBoost rows (reg_df is already sorted by R2).
reg_df.loc[reg_df["reg"].eq("XGBRegressor")].head(10)

In [None]:
# Re-fit the ten best-ranked XGBoost configurations and print each one's
# hyper-parameters together with its test-set R2.
# .copy() makes xgb_df an independent frame, so the column assignments below
# write to it unambiguously instead of to a slice of reg_df.
xgb_df = reg_df[reg_df['reg'] == "XGBRegressor"].head(10).copy()
# The stored dict views are materialised as lists so they can be zipped back
# into keyword arguments.
xgb_df['param.keys'] = xgb_df['param.keys'].apply(list)
xgb_df['param.values'] = xgb_df['param.values'].apply(list)

parameters_list = []
for keys, values in zip(xgb_df['param.keys'], xgb_df['param.values']):
    params = dict(zip(keys, values))
    reg = XGBRegressor(random_state=RANDOM_SEED, n_jobs=-3, verbosity=0, **params)
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)
    print(params)
    print(r2_score(y_test, y_pred))
    print()

In [None]:
# Fit XGBoost with one fixed hyper-parameter set and report its test R2.
xgb_params = dict(
    max_depth=9,
    n_estimators=100,
    learning_rate=0.2,
    subsample=0.6,
    colsample_bytree=0.9999999999999999,
)
reg = XGBRegressor(random_state=RANDOM_SEED, n_jobs=-3, verbosity=0, **xgb_params)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

r2_score(y_test, y_pred)

In [None]:
# Horizontal bar chart of the fitted model's feature importances, ascending.
feature_names = matches_df.drop(columns='Effectiveness').columns
top_features = pd.Series(reg.feature_importances_, index=feature_names).sort_values()
top_features.plot.barh(figsize=(15, 10), title="Top Features")
plt.show()

In [None]:
# Re-fit the ten best-ranked random forest configurations and print each
# one's hyper-parameters together with its test-set R2.
# .copy() makes rf_df an independent frame, so the column assignments below
# write to it unambiguously instead of to a slice of reg_df.
rf_df = reg_df[reg_df['reg'] == "RandomForestRegressor"].head(10).copy()
# The stored dict views are materialised as lists so they can be zipped back
# into keyword arguments.
rf_df['param.keys'] = rf_df['param.keys'].apply(list)
rf_df['param.values'] = rf_df['param.values'].apply(list)

parameters_list = []
for keys, values in zip(rf_df['param.keys'], rf_df['param.values']):
    params = dict(zip(keys, values))
    reg = RandomForestRegressor(random_state=RANDOM_SEED, n_jobs=-3, **params)
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)
    print(params)
    print(r2_score(y_test, y_pred))
    print()

In [None]:
# Fit a random forest with one fixed hyper-parameter set and report its test R2.
rf_params = dict(max_depth=8, n_estimators=9)
reg = RandomForestRegressor(random_state=RANDOM_SEED, n_jobs=-3, **rf_params)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

r2_score(y_test, y_pred)

In [None]:
# Horizontal bar chart of the fitted model's feature importances, ascending.
feature_names = matches_df.drop(columns='Effectiveness').columns
top_features = pd.Series(reg.feature_importances_, index=feature_names).sort_values()
top_features.plot.barh(figsize=(15, 10), title="Top Features")
plt.show()

In [None]:
from sklearn.model_selection import GridSearchCV

# 5-fold cross-validated grid search over decision tree depth and the
# minimum sample counts required to split a node / form a leaf.
param_grid = {
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# The searched estimator is seeded with RANDOM_SEED so repeated runs select
# the same hyper-parameters.
grid_search = GridSearchCV(DecisionTreeRegressor(random_state=RANDOM_SEED), param_grid, cv=5)

grid_search.fit(X_train, y_train)

best_params = grid_search.best_params_
print("Best Hyperparameters:", best_params)

In [None]:
# Re-fit a decision tree with the grid search's best parameters and report test R2.
reg = DecisionTreeRegressor(**best_params, random_state=RANDOM_SEED)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

r2_score(y_test, y_pred)

In [None]:
# Horizontal bar chart of the fitted model's feature importances, ascending.
feature_names = matches_df.drop(columns='Effectiveness').columns
top_features = pd.Series(reg.feature_importances_, index=feature_names).sort_values()
top_features.plot.barh(figsize=(15, 10), title="Top Features")
plt.show()

In [None]:
# 2-fold cross-validated grid search over CatBoost depth, learning rate and
# number of boosting iterations.
parameters = {'depth' : [6,8,10],'learning_rate' : [0.01, 0.05, 0.1], 'iterations' : [30, 50, 100]}
# The searched estimator is seeded with RANDOM_SEED so repeated runs select
# the same hyper-parameters, and verbose=0 suppresses the per-iteration
# training log of every candidate fit.
grid = GridSearchCV(
    estimator = cb.CatBoostRegressor(random_state=RANDOM_SEED, verbose=0),
    param_grid = parameters,
    cv = 2,
    n_jobs=-1,
)
grid.fit(X_train, y_train)

print(" Results from Grid Search " )
print("\n The best estimator across ALL searched params:\n", grid.best_estimator_)
print("\n The best score across ALL searched params:\n", grid.best_score_)
print("\n The best parameters across ALL searched params:\n", grid.best_params_)

In [None]:
# Re-fit CatBoost with the grid search's best parameters and report test R2.
reg = cb.CatBoostRegressor(**grid.best_params_, random_state=RANDOM_SEED)
reg.fit(X_train, y_train)
y_pred = reg.predict(X_test)

r2_score(y_test, y_pred)

In [None]:
# Horizontal bar chart of the fitted model's feature importances, ascending.
feature_names = matches_df.drop(columns='Effectiveness').columns
top_features = pd.Series(reg.feature_importances_, index=feature_names).sort_values()
top_features.plot.barh(figsize=(15, 10), title="Top Features")
plt.show()

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from scikeras.wrappers import KerasRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

matches_df.to_csv('matches_df.csv')

# The header line is skipped while parsing (skiprows=1), so column dtypes are
# inferred from the data rows only. Reading the header as a data row and
# slicing it off afterwards leaves every column as strings.
dataframe = pd.read_csv("matches_df.csv", header=None, skiprows=1)
dataset = dataframe.values

# Positional split: column 13 is the target, the other columns in 0..24 are
# the inputs.
# NOTE(review): to_csv() above writes the DataFrame index as CSV column 0, so
# the index is part of X here - confirm that is intended.
# NOTE(review): presumably column 13 is 'Effectiveness'; verify against
# matches_df's column order.
X = dataset[:,[i for i in range(25) if i!=13]]
Y = dataset[:,13]

def baseline_model():
    """Build and compile a one-hidden-layer regression network.

    Returns a ``Sequential`` model taking 24 inputs, with a 24-unit ReLU
    hidden layer and a single output unit with no activation, compiled with
    mean squared error loss and the Adam optimizer.
    """
    model = Sequential()
    model.add(Dense(24, input_shape=(24,), kernel_initializer='normal', activation='relu'))
    model.add(Dense(1, kernel_initializer='normal'))
    model.compile(loss='mean_squared_error', optimizer='adam')
    return model

# 10-fold cross-validation of the baseline network. The scorer returns the
# negated MSE, so the printed mean is negative.
estimator = KerasRegressor(model=baseline_model, epochs=100, batch_size=5, verbose=0)
kfold = KFold(n_splits=10)
results = cross_val_score(estimator, X, Y, cv=kfold, scoring='neg_mean_squared_error')
print("Baseline: %.2f (%.2f) MSE" % (results.mean(), results.std()))

In [None]:
from sklearn.pipeline import Pipeline
# Same 10-fold cross-validation as the baseline, with the inputs standardised
# inside each fold before they reach the network.
estimators = [
    ('standardize', StandardScaler()),
    ('mlp', KerasRegressor(model=baseline_model, epochs=50, batch_size=5, verbose=0)),
]
pipeline = Pipeline(estimators)
kfold = KFold(n_splits=10)
results = cross_val_score(pipeline, X, Y, cv=kfold, scoring='neg_mean_squared_error')
score_mean, score_std = results.mean(), results.std()
print("Standardized: %.2f (%.2f) MSE" % (score_mean, score_std))

In [None]:
# The header line is skipped while parsing (skiprows=1), so column dtypes are
# inferred from the data rows only. Reading the header as a data row and
# slicing it off afterwards leaves every column as strings.
dataframe = pd.read_csv("matches_df.csv", header=None, skiprows=1)
dataset = dataframe.values

# Positional split: column 13 is the target, the other columns in 0..24 are
# the inputs.
# NOTE(review): if the CSV was written with to_csv()'s default index=True,
# column 0 is the DataFrame index and is part of X - confirm that is intended.
X = dataset[:,[i for i in range(25) if i!=13]]
y = dataset[:,13]

# Hold out 20% of the rows; this rebinds the notebook-level train/test names.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [None]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from sklearn.metrics import r2_score
from keras.optimizers import Adam
import keras
from keras.callbacks import EarlyStopping
import pandas as pd 

# Keras needs numeric tensors; casting here keeps the cell working whether
# the split arrays arrive as float or as object arrays of numeric strings.
X_train_nn = np.asarray(X_train, dtype="float32")
X_test_nn = np.asarray(X_test, dtype="float32")
y_train_nn = np.asarray(y_train, dtype="float32")
y_test_nn = np.asarray(y_test, dtype="float32")

# Three ReLU hidden layers (128 -> 32 -> 8) followed by a linear output unit.
# The input width is read from the data so it always matches the feature matrix.
model = Sequential()
model.add(Dense(128, activation="relu", input_dim=X_train_nn.shape[1]))
model.add(Dense(32, activation="relu"))
model.add(Dense(8, activation="relu"))

model.add(Dense(1, activation="linear"))

# Inverse-time decay: lr = 1e-3 / (1 + (1e-3 / 200) * step), expressed as a
# schedule passed through `learning_rate`.
lr_schedule = keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=1e-3, decay_steps=1, decay_rate=1e-3 / 200
)
model.compile(loss='mean_squared_error', optimizer=Adam(learning_rate=lr_schedule))

# Training stops once val_loss has not improved for 200 consecutive epochs;
# the very large epoch count only acts as an upper bound.
es = EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=200)

# NOTE(review): the test split doubles as the validation set that drives
# early stopping - confirm that is acceptable for the reported metrics.
history = model.fit(X_train_nn, y_train_nn, validation_data=(X_test_nn, y_test_nn), epochs=10000000, batch_size=100, verbose=2, callbacks=[es])

# Predictions on the training split and on the held-out split, saved to the
# correspondingly named CSV files.
PredTestSet = model.predict(X_train_nn)
PredValSet = model.predict(X_test_nn)

np.savetxt("trainresults.csv", PredTestSet, delimiter=",")
np.savetxt("valresults.csv", PredValSet, delimiter=",")