In [17]:
# Importing the libraries
import pandas as pd
import numpy as np
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [2]:
# Importing the dataset
penaltyData = pd.read_csv('C:\Rishi\Manipal\Assignment\ML1_penalty_data.csv', encoding="Latin")

In [3]:
# Understanding dataset
penaltyData.shape
penaltyData.head()

Unnamed: 0,No.,Match Week,Date,Player,Team,Versus,Match,Time of Penalty Awarded,Scored,Foot,Kick_Direction,Keeper_Direction
0,1,1,13-Aug-16,Riyad Mahrez,Leicester,Hull,Hull vs Leicester,47' minute,Scored,L,C,R
1,2,1,13-Aug-16,Sergio Agüero,Man City,Sunderland,Man City vs Sunderland,4' minute,Scored,R,L,L
2,3,1,14-Aug-16,Theo Walcott,Arsenal,Liverpool,Arsenal vs Liverpool,30' minute,Missed,R,L,L
3,4,1,15-Aug-16,Eden Hazard,Chelsea,West Ham,Chelsea vs West Ham,47' minute,Scored,R,C,L
4,5,2,19-Aug-16,Zlatan Ibrahimovic,Man United,Southampton,Man United vs Southampton,52' minute,Scored,R,L,R


In [4]:
# Checking any null entry in dataset
penaltyData.isnull().sum(axis=0)

No.                        0
Match Week                 0
Date                       0
Player                     0
Team                       0
Versus                     0
Match                      0
Time of Penalty Awarded    0
Scored                     0
Foot                       0
Kick_Direction             0
Keeper_Direction           0
dtype: int64

In [5]:
# Print the number of L,R,C entry of keeper direction

print(len(penaltyData[penaltyData.Keeper_Direction == 'L']))
print(len(penaltyData[penaltyData.Keeper_Direction == 'R']))
print(len(penaltyData[penaltyData.Keeper_Direction == 'C']))

45
52
6


In [6]:
# remove row where keeper direction is Center as very less data is there
penaltyData = penaltyData[penaltyData.Keeper_Direction != 'C']

In [7]:
# Printing columns of dataset
penaltyData.columns

Index(['No.', 'Match Week', 'Date', 'Player', 'Team', 'Versus', 'Match',
       'Time of Penalty Awarded', 'Scored', 'Foot', 'Kick_Direction',
       'Keeper_Direction'],
      dtype='object')

In [8]:
# Convert string value of keeper direction into Number

penaltyData.Keeper_Direction[penaltyData.Keeper_Direction == 'L'] = 0
penaltyData.Keeper_Direction[penaltyData.Keeper_Direction == 'R'] = 1
penaltyData.Keeper_Direction = penaltyData.Keeper_Direction.astype('int')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [9]:
# Creating dummy variables
cat_vars=['Foot', 'Kick_Direction', 'Scored']
for var in cat_vars:
    cat_list='var'+'_'+var
    cat_list = pd.get_dummies(penaltyData[var], prefix=var)
    penaltyData1 = penaltyData.join(cat_list)
    penaltyData = penaltyData1

cat_vars=['Foot', 'Kick_Direction', 'Scored']
data_vars = penaltyData.columns.values.tolist()
to_keep=[i for i in data_vars if i not in cat_vars]

In [10]:
data_final=penaltyData[to_keep]
data_final.columns.values

array(['No.', 'Match Week', 'Date', 'Player', 'Team', 'Versus', 'Match',
       'Time of Penalty Awarded', 'Keeper_Direction', 'Foot_L', 'Foot_R',
       'Kick_Direction_C', 'Kick_Direction_L', 'Kick_Direction_R',
       'Scored_Missed', 'Scored_Scored'], dtype=object)

In [11]:
# Removing not necessart data from dataset
data_final=data_final.drop(['No.', 'Match Week', 'Date', 'Player', 'Team', 'Versus', 'Match', 'Time of Penalty Awarded'], axis =1)

In [12]:
# Printing columns of dataset after dropping unnecessary column
data_final.columns

Index(['Keeper_Direction', 'Foot_L', 'Foot_R', 'Kick_Direction_C',
       'Kick_Direction_L', 'Kick_Direction_R', 'Scored_Missed',
       'Scored_Scored'],
      dtype='object')

In [13]:
#Split into independent and dependent variable
X=data_final[['Foot_L', 'Foot_R', 'Kick_Direction_C', 'Kick_Direction_L', 'Kick_Direction_R', 'Scored_Missed', 'Scored_Scored']]
y=data_final.Keeper_Direction

In [14]:
# Splitting the dataset into the Training set and Test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [15]:
# Compute with DecisionTreeClassifier

from sklearn.tree import DecisionTreeClassifier
classifier = DecisionTreeClassifier(max_depth=3)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

print('Accuracy = {:.2f}'.format(classifier.score(X_test,y_test)))

Accuracy = 0.64


In [18]:
# Calculating accuracy of model

print(confusion_matrix(y_test,y_pred))  
print(classification_report(y_test,y_pred))  
print(accuracy_score(y_test, y_pred))  

[[9 3]
 [6 7]]
              precision    recall  f1-score   support

           0       0.60      0.75      0.67        12
           1       0.70      0.54      0.61        13

   micro avg       0.64      0.64      0.64        25
   macro avg       0.65      0.64      0.64        25
weighted avg       0.65      0.64      0.64        25

0.64
