# Importing Libraries

In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np # linear algebra 
import math
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import xgboost
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.model_selection import RepeatedStratifiedKFold
import time
from sklearn.model_selection import cross_val_score
import pickle

# data visualization(for EDA)
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('ggplot')
sns.set(color_codes=True)
import plotly.express as px
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

#ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found anywhere under the Kaggle input directory.
input_file_paths = (
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for input_path in input_file_paths:
    print(input_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

ModuleNotFoundError: No module named 'xgboost'

# Loading and Importing Data

In [None]:
# Read the PSL ball-by-ball CSV from the Kaggle input directory and preview the first rows.
PSL_CSV_PATH = '../input/pakistan-super-leaguepsl-ball-by-ball-20162020/psl_formated.csv'
df = pd.read_csv(PSL_CSV_PATH)
df.head()


# Data preprocessing (Cleaning)

In [None]:
df['wicket'].unique()

In [None]:
df['wicket_text'].unique()

In [None]:
# Replace missing values in the 'wicket' column with 0.
# The filled column is assigned back to `df` rather than calling
# fillna(..., inplace=True) on the selected Series: an in-place call on a
# selected column does not update the parent frame when pandas Copy-on-Write
# is enabled. Assigning back also makes the cell safe to re-run.
df['wicket'] = df['wicket'].fillna(0)
df.head()

#### Here, we replaced the NaN values in the 'wicket' column with 0.

# Boundaries Count

In [None]:
# Per-season summary: total fours, total sixes, and the highest match number
# seen in that season (used below as the season's match count).
boundries_df = (
    df.groupby(['psl_year'])
    .agg(
        fours=('is_four', 'sum'),
        sixes=('is_six', 'sum'),
        matches=('match_number', 'max'),
    )
    .reset_index()
)
boundries_df

In [None]:
# Bar chart of total fours per PSL season.
ax = boundries_df.plot.bar(x='psl_year', y='fours')

In [None]:
# Bar chart of total sixes per PSL season.
ax = boundries_df.plot.bar(x='psl_year', y='sixes')

In [None]:
# Normalise the season totals by the season's match count.
matches_per_season = boundries_df['matches']
boundries_df['fours_per_match'] = boundries_df['fours'].div(matches_per_season)
boundries_df['six_per_match'] = boundries_df['sixes'].div(matches_per_season)

In [None]:
# Bar chart of fours per match for each PSL season.
ax = boundries_df.plot.bar(x='psl_year', y='fours_per_match')

In [None]:
# Bar chart of sixes per match for each PSL season.
ax = boundries_df.plot.bar(x='psl_year', y='six_per_match')

#### In this section we found the following insights:
- Most fours and fours per match were scored in 2019.
- Most sixes and sixes per match were scored in 2018.

# Wickets

In [None]:
# Count dismissals by 'wicket_text'. Kinds seen fewer than 100 times are
# folded into a single 'others' bucket, appended after the frequent kinds.
wickets = df['wicket_text'].value_counts()
counts_by_kind = dict(wickets)
wicket_dict = {kind: count for kind, count in counts_by_kind.items() if count >= 100}
others = sum(count for count in counts_by_kind.values() if count < 100)
wicket_dict['others'] = others
wicket_dict

In [None]:
# Bar chart of dismissal counts per kind (including the 'others' bucket).
wicket_kinds, wicket_counts = zip(*wicket_dict.items())
plt.bar(wicket_kinds, wicket_counts)

#### Here, we found an interesting fact that most of the players lost their wickets by getting caught.

# Team Based Dataframes

In [None]:
# Short label -> full franchise name as it appears in 'team_1' / 'team_2'.
TEAM_FULL_NAMES = {
    'Quetta': "Quetta Gladiators",
    'Lahore': "Lahore Qalandars",
    'Islamabad': "Islamabad United",
    'Peshawar': "Peshawar Zalmi",
    'Multan': "Multan Sultans",
    'Karachi': "Karachi Kings",
}


def _team_innings_rows(full_name):
    """Select the rows of `df` where `full_name` is team_1 in inning 1 or team_2 in inning 2.

    Presumably these are the deliveries the team faced while batting --
    confirm against the dataset's column definitions.
    """
    as_team_1 = (df['team_1'] == full_name) & (df['inning'] == 1)
    as_team_2 = (df['team_2'] == full_name) & (df['inning'] == 2)
    return df.loc[as_team_1 | as_team_2]


# One sub-frame of deliveries per team, keyed by the short label.
teams_df = {
    short_name: _team_innings_rows(full_name)
    for short_name, full_name in TEAM_FULL_NAMES.items()
}

# Team Based Runs

In [None]:
# Total of the 'runs' column per team.
team_runs = {
    team_name: balls['runs'].sum()
    for team_name, balls in teams_df.items()
}
# Number of distinct (season, match number) pairs each team appears in.
team_matches = {
    team_name: balls.groupby(['psl_year', 'match_number']).ngroups
    for team_name, balls in teams_df.items()
}

In [None]:
# Bar chart of total runs per team.
run_labels, run_totals = zip(*team_runs.items())
plt.bar(run_labels, run_totals)

#### Among all the teams Peshawar scored the most runs, followed by Islamabad United.

# Team Based Percentage Runs in Boundaries

In [None]:
# Runs that came from boundaries: six per 'is_six' delivery, four per 'is_four' delivery.
team_boundry_runs = {
    team_name: (balls['is_six'].sum() * 6) + (balls['is_four'].sum() * 4)
    for team_name, balls in teams_df.items()
}

# Boundary runs as a percentage of each team's total runs.
boundry_run_percentage = {
    team: (team_boundry_runs[team] / total_runs) * 100
    for team, total_runs in team_runs.items()
}

In [None]:
# Bar chart of the percentage of runs scored in boundaries, per team.
boundary_labels, boundary_shares = zip(*boundry_run_percentage.items())
plt.bar(boundary_labels, boundary_shares)

#### This bar chart shows that Islamabad United scored the largest share of their runs from boundaries.

# Dot Balls Played

In [None]:
# Deliveries with zero runs, and all deliveries, per team.
dot_balls_played = {
    team: len(balls[balls['runs'] == 0])
    for team, balls in teams_df.items()
}
total_balls_played = {team: len(balls) for team, balls in teams_df.items()}

# Dot balls as a percentage of all deliveries in the team's sub-frame.
dot_ball_percentage = {
    team: (dots / total_balls_played[team]) * 100
    for team, dots in dot_balls_played.items()
}

# for team , value in dot_ball_percentage.items():
#     print (f"Team : {team} played {math.trunc(value)}% of dot balls")

In [None]:
# Bar chart of dot-ball percentage per team.
dot_labels, dot_shares = zip(*dot_ball_percentage.items())
plt.bar(dot_labels, dot_shares)

#### This chart shows that Lahore Qalandars played the highest percentage of dot balls.

# Six Hit Per Team

In [None]:
# Total sixes per team, then the same figure divided by the team's match count.
six_per_team = {team: balls['is_six'].sum() for team, balls in teams_df.items()}
six_per_match = {
    team: sixes_hit / team_matches[team]
    for team, sixes_hit in six_per_team.items()
}

# Bar chart of sixes per match.
six_rate_labels, six_rates = zip(*six_per_match.items())
plt.bar(six_rate_labels, six_rates)

#### In terms of sixes per match, Islamabad United hit the most.

In [None]:
# Bar chart of total sixes per team.
six_total_labels, six_totals = zip(*six_per_team.items())
plt.bar(six_total_labels, six_totals)

#### In terms of total sixes, Peshawar Zalmi hit the most of any team.

# Four Hit Per Team

In [None]:
# Total fours per team, then the same figure divided by the team's match count.
four_per_team = {team: balls['is_four'].sum() for team, balls in teams_df.items()}
four_per_match = {
    team: fours_hit / team_matches[team]
    for team, fours_hit in four_per_team.items()
}

# Bar chart of fours per match.
four_rate_labels, four_rates = zip(*four_per_match.items())
plt.bar(four_rate_labels, four_rates)

#### Interestingly, all the teams scored an almost identical number of fours per match.

# Runs Scored Per Team

In [None]:
# Total runs per team, then the same figure divided by the team's match count.
runs_per_team = {team: balls['runs'].sum() for team, balls in teams_df.items()}
runs_per_match = {
    team: runs_scored / team_matches[team]
    for team, runs_scored in runs_per_team.items()
}

# Bar chart of runs per match.
run_rate_labels, run_rates = zip(*runs_per_match.items())
plt.bar(run_rate_labels, run_rates)

#### Here again, all the teams scored an almost identical number of runs per match.

# Wickets Per Team

In [None]:
# Sum of 'is_wicket' per team, then the same figure divided by the team's match count.
wickets_per_team = {team: balls['is_wicket'].sum() for team, balls in teams_df.items()}
wickets_per_match = {
    team: wickets_lost / team_matches[team]
    for team, wickets_lost in wickets_per_team.items()
}

# Bar chart of total wickets per team (the per-match chart is drawn in the next cell).
wicket_total_labels, wicket_totals = zip(*wickets_per_team.items())
plt.bar(wicket_total_labels, wicket_totals)

#### The bar chart above shows that Peshawar Zalmi lost the most wickets overall.

In [None]:
# Bar chart of wickets per match, per team.
wicket_rate_labels, wicket_rates = zip(*wickets_per_match.items())
plt.bar(wicket_rate_labels, wicket_rates)

#### Here, the bar chart above shows that Lahore Qalandars lost the most wickets per match.

# Defining Functions to Build the Prediction Model

In [None]:
# Method to calculate if the team batting second won or not
def is_winner(row):
    """Return 1 if the team batting second (``team_2``) won, otherwise 0.

    Parameters
    ----------
    row : mapping
        One delivery, e.g. a pandas Series produced by ``DataFrame.apply(axis=1)``.
        Must provide the keys ``'result'`` and ``'team_2'``.

    Returns
    -------
    int
        1 when ``result`` is a non-empty string that occurs inside ``team_2``;
        0 in every other case, including a missing or non-string ``result``
        or ``team_2``.
    """
    result = row['result']
    team_2 = row['team_2']

    # Missing values arrive as NaN floats rather than strings. Without both a
    # textual result and a textual team name there is nothing to compare, and
    # ``result in team_2`` would raise TypeError for a non-string team_2.
    if not isinstance(result, str) or not isinstance(team_2, str):
        return 0

    # The empty string is a substring of every string, so a blank result
    # must not be counted as a win.
    if not result:
        return 0

    return 1 if result in team_2 else 0

In [None]:
# Method to calculate if a player was dismissed on this ball or not
def is_out(row):
    """Return 1 if ``row['player_dismissed']`` holds a string, otherwise 0.

    A string value means a dismissed player is recorded for this delivery;
    anything else (e.g. NaN for a missing value) yields 0.
    """
    return 1 if type(row['player_dismissed']) is str else 0

# Data Manipulation to Extract Important Features

In [None]:
# Partition the deliveries by innings number (1 or 2).
inning_number = df['inning']
first_inning = df.loc[inning_number == 1]
second_inning = df.loc[inning_number == 2]

In [None]:
# Target per match: the sum of 'runs' over the first innings, keyed by
# (psl_year, match_number).
# NOTE(review): this is the first-innings total; the score needed to win is
# conventionally one run more -- confirm which definition is intended.
total_sum = (
    first_inning
    .groupby(["psl_year", "match_number"])["runs"]
    .sum()
    .to_frame(name="target")
)

In [None]:
# Move the current index of total_sum back into ordinary columns so the frame
# can be merged on them. This cell is not idempotent: re-running it resets the
# index again.
total_sum = total_sum.reset_index()

In [None]:
# Attach each match's target to every second-innings delivery of that match.
# A left join keeps every row of total_sum even when no second-innings rows match.
match_keys = ['psl_year', 'match_number']
new_df = total_sum.merge(second_inning, how='left', on=match_keys)

In [None]:
# Calculating if Second Inning team won or not
# is_winner is applied row by row (axis=1); it returns 1 or 0, so every
# delivery of a match carries the same label as long as 'result' and 'team_2'
# are constant within the match -- TODO confirm against the data.
new_df['won'] = new_df.apply(is_winner, axis=1)

In [None]:
# Balls remaining out of 120 after the current delivery.
# Assumes 'over' is 1-based and 'ball' counts deliveries within the over -- TODO confirm.
balls_bowled = ((new_df['over'] - 1) * 6) + new_df['ball']
new_df['balls_left'] = 120 - balls_bowled

In [None]:
# Creating the Final Dataset
# .copy() makes final_df an independent frame instead of a selection of
# new_df, so columns can be added to it later without SettingWithCopyWarning
# and without ambiguity about which frame is being modified.
final_df = new_df[['over','ball','total_runs','wickets','target','balls_left','won']].copy()

In [None]:
final_df.head()

In [None]:
# Runs still required: the target minus 'total_runs' at this delivery.
# assign() binds a new frame to final_df instead of writing a column into an
# existing frame that may be a selection of another one. This avoids
# SettingWithCopyWarning and keeps the cell safe to re-run.
final_df = final_df.assign(runs_left=final_df['target'] - final_df['total_runs'])

# Selecting particular columns for our model

In [None]:
# Columns kept for modelling: three features followed by the label 'won'.
# The order matters, because the X/y split below selects columns by position.
new_cols =['wickets','balls_left','runs_left','won']

In [None]:
# Restrict the dataset to the modelling columns listed in new_cols and preview it.
psl_df = final_df[new_cols]
psl_df.head()

In [None]:
psl_df.isnull().sum()

In [None]:
psl_df.shape

In [None]:
# Coerce every column to a numeric dtype (unparseable values become NaN),
# then drop any row that contains a NaN.
psl_df = psl_df.apply(pd.to_numeric, errors='coerce').dropna()

In [None]:
psl_df.shape

In [None]:
# Features / label split by column name. psl_df's columns are, in order,
# wickets, balls_left, runs_left, won.
feature_cols = ['wickets', 'balls_left', 'runs_left']
y = psl_df['won']
X = psl_df[feature_cols]

In [None]:
# Train and Test Split
# Holds out 20% of the rows for evaluation; random_state fixes the shuffle so
# the split is reproducible.
# NOTE(review): each row appears to be a single delivery, so deliveries from
# the same match can land in both train and test, which would inflate the
# reported accuracy -- confirm whether a per-match split is wanted.
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state = 42)

# Making predictions with Random Forest Classifier

In [None]:
# Training Model
# Fit a random forest of 10,000 depth-limited trees and report the elapsed time.
# n_jobs=-1 builds the trees on all available CPU cores. Each tree's seed is
# still derived from random_state, so the fitted forest is unchanged and only
# the training time drops.
# time.perf_counter() is a monotonic, high-resolution clock meant for measuring
# elapsed time; time.time() can jump if the system clock is adjusted.
start = time.perf_counter()
RF = RandomForestClassifier(n_estimators = 10000, max_depth=6, random_state=0, n_jobs=-1)
RF.fit(X_train, y_train)
print(f"Time Taken to Train {time.perf_counter()-start} seconds")

In [None]:
# Predict the label for every held-out row with the fitted random forest.
y_pred_test = RF.predict(X_test)

In [None]:
accuracy_score(y_test, y_pred_test)

In [None]:
# Class probabilities from the random forest for one hand-written match state.
# Keys are listed in the same order as the training feature columns.
current = dict(wickets=1, balls_left=105, runs_left=137)
current_df = pd.DataFrame(current, index=[0])
RF.predict_proba(current_df)

# Making predictions with XGBoost Classifier

In [None]:
# Fit an XGBoost classifier, constructed with no arguments (library defaults),
# on the training split. fit() is the last expression, so its return value is
# shown as the cell output.
XGBC = xgboost.XGBClassifier()
XGBC.fit(X_train, y_train)

In [None]:
# Predict the held-out rows with XGBoost and round each prediction to an integer label.
y_pred = XGBC.predict(X_test)
predictions = list(map(round, y_pred))

In [None]:
# Score the XGBoost predictions against the held-out labels and print the result as a percentage.
accuracy = accuracy_score(y_test, predictions)
accuracy_pct = accuracy * 100.0
print("Accuracy: %.2f%%" % (accuracy_pct,))

In [None]:
# Class probabilities from the XGBoost model for one hand-written match state.
# Keys are listed in the same order as the training feature columns.
current = dict(wickets=1, balls_left=105, runs_left=137)
current_df = pd.DataFrame(current, index=[0])
XGBC.predict_proba(current_df)

# Making predictions with SVM 

In [None]:
# Build a support-vector classifier with default settings and train it
# (fit() returns the estimator itself).
SVC_model = svm.SVC().fit(X_train, y_train)

# Predict the held-out rows and report the accuracy as a percentage.
predicted = SVC_model.predict(X_test)
a = accuracy_score(y_test, predicted)
print('The accuracy using SVC Classifier is:', format(a * 100))

In [None]:
# Serialise the fitted XGBoost classifier to disk with pickle.
model_file = "model.sav"
with open(model_file, 'wb') as model_f:
    model_f.write(pickle.dumps(XGBC))

In [None]:
# Load the pickled model back and confirm it still scores the held-out split.
# pickle can execute arbitrary code on load: only unpickle files you wrote yourself.
with open(model_file, 'rb') as model_f:
    model = pickle.loads(model_f.read())
result = model.score(X_test, y_test)
print("result:", result)