In [5]:
import numpy as np
import pandas as pd
import seaborn as sns 
import pandas_profiling as pp
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler

In [6]:
train = pd.read_csv('datasets/train.csv')
test = pd.read_csv('datasets/test.csv')
sample_submission = pd.read_csv('datasets/sample_submission.csv')

In [7]:
pp.ProfileReport(train)



In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24402 entries, 0 to 24401
Data columns (total 18 columns):
match_event_id      23165 non-null float64
location_x          23246 non-null float64
location_y          23158 non-null float64
remaining_min       24334 non-null float64
power_of_shot       24345 non-null float64
knockout_match      24402 non-null float64
game_season         24402 non-null object
remaining_sec       24344 non-null float64
distance_of_shot    24325 non-null float64
is_goal             24402 non-null float64
area_of_shot        23222 non-null object
shot_basics         23119 non-null object
range_of_shot       23156 non-null object
home/away           24402 non-null object
shot_id_number      24402 non-null int64
lat/lng             24402 non-null object
type_of_shot        24402 non-null int64
match_id            24402 non-null int64
dtypes: float64(9), int64(3), object(6)
memory usage: 3.4+ MB


In [5]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 17 columns):
match_event_id      4728 non-null float64
location_x          4753 non-null float64
location_y          4762 non-null float64
remaining_min       4990 non-null float64
power_of_shot       4988 non-null float64
knockout_match      5000 non-null float64
game_season         5000 non-null object
remaining_sec       4985 non-null float64
distance_of_shot    4985 non-null float64
area_of_shot        4741 non-null object
shot_basics         4767 non-null object
range_of_shot       4746 non-null object
home/away           5000 non-null object
shot_id_number      5000 non-null int64
lat/lng             5000 non-null object
type_of_shot        5000 non-null int64
match_id            5000 non-null int64
dtypes: float64(8), int64(3), object(6)
memory usage: 664.1+ KB


##### Combining Train and Test

In [8]:
df=train.append(test,ignore_index=True)
df.tail()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Unnamed: 0,area_of_shot,distance_of_shot,game_season,home/away,is_goal,knockout_match,lat/lng,location_x,location_y,match_event_id,match_id,power_of_shot,range_of_shot,remaining_min,remaining_sec,shot_basics,shot_id_number,type_of_shot
29397,Center(C),42.0,1999-00,MANU @ IND,,1.0,"40.361408, -86.186052",-23.0,222.0,386.0,49900087,4.0,16-24 ft.,7.0,27.0,,30669,3
29398,Center(C),20.0,1999-00,MANU vs. IND,,1.0,"42.982923, -71.446094",0.0,0.0,213.0,49900088,2.0,Less Than 8 ft.,0.0,40.0,Goal Area,30681,4
29399,Left Side(L),28.0,1999-00,MANU vs. IND,,1.0,"42.982923, -71.446094",,48.0,226.0,49900088,3.0,8-16 ft.,11.0,30.0,Goal Line,30683,4
29400,Center(C),29.0,1999-00,MANU vs. IND,,1.0,"42.982923, -71.446094",16.0,93.0,268.0,49900088,3.0,8-16 ft.,5.0,37.0,Goal Line,30687,4
29401,Center(C),20.0,1999-00,MANU vs. IND,,1.0,"42.982923, -71.446094",0.0,0.0,398.0,49900088,4.0,Less Than 8 ft.,6.0,5.0,Goal Area,30694,4


In [9]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29402 entries, 0 to 29401
Data columns (total 18 columns):
area_of_shot        27963 non-null object
distance_of_shot    29310 non-null float64
game_season         29402 non-null object
home/away           29402 non-null object
is_goal             24402 non-null float64
knockout_match      29402 non-null float64
lat/lng             29402 non-null object
location_x          27999 non-null float64
location_y          27920 non-null float64
match_event_id      27893 non-null float64
match_id            29402 non-null int64
power_of_shot       29333 non-null float64
range_of_shot       27902 non-null object
remaining_min       29324 non-null float64
remaining_sec       29329 non-null float64
shot_basics         27886 non-null object
shot_id_number      29402 non-null int64
type_of_shot        29402 non-null int64
dtypes: float64(9), int64(3), object(6)
memory usage: 4.0+ MB


In [10]:
#Taking a brief look at the remaining_min
df.remaining_min.describe()

count    29324.000000
mean         4.830753
std          3.468249
min          0.000000
25%          2.000000
50%          5.000000
75%          8.000000
max         11.000000
Name: remaining_min, dtype: float64

In [11]:
#Mean seems like a good fit 
df.remaining_min.fillna(df.remaining_min.mean(),inplace=True)

In [12]:
#Taking a brief look at the power_of_shot
df.power_of_shot.describe()

count    29333.000000
mean         2.493915
std          1.171974
min          0.000000
25%          1.000000
50%          3.000000
75%          3.000000
max          6.000000
Name: power_of_shot, dtype: float64

In [13]:
#Since we have just 7 values, taking the median seems like a good choice
df.power_of_shot.fillna(4,inplace=True)

In [14]:
df.remaining_sec.describe()

count    29329.000000
mean        28.165979
std         17.592542
min          0.000000
25%         13.000000
50%         28.000000
75%         43.000000
max         59.000000
Name: remaining_sec, dtype: float64

In [15]:
#Mean and median are nealry same, so replacing missing values with median
df.remaining_sec.fillna(df.remaining_sec.median(),inplace=True)

In [16]:
df.distance_of_shot.describe()

count    29310.000000
mean        33.665643
std          9.887551
min          0.000000
25%         25.000000
50%         35.000000
75%         41.000000
max         99.000000
Name: distance_of_shot, dtype: float64

In [17]:
#Mean and median are nealry same, so replacing missing values with median
df.distance_of_shot.fillna(df.distance_of_shot.median(),inplace=True)

In [18]:
#Taking a brief look at the area_of_shot
df.area_of_shot.describe()

count         27963
unique            6
top       Center(C)
freq          12204
Name: area_of_shot, dtype: object

In [19]:
#Center postion occurs most frequently, so it seems fair enough to replace NaN with center 
df.area_of_shot.fillna(df.area_of_shot.describe().top,inplace=True)

In [20]:
#taking a brief look at the shot basics
df.shot_basics.describe()

count         27886
unique            7
top       Mid Range
freq          11465
Name: shot_basics, dtype: object

In [21]:
#Mid Range occurs most frequently, so it seems fair enough to replace NaN with Mid Range 
df.shot_basics.fillna(df.shot_basics.describe().top,inplace=True)

In [22]:
#taking a brief look at the Range of Shot
df.range_of_shot.describe()

count               27902
unique                  5
top       Less Than 8 ft.
freq                 8510
Name: range_of_shot, dtype: object

In [23]:
#Less than 8ft occurs most frequently, so it seems fair enough to replace NaN with Less than 8 ft
df.range_of_shot.fillna(df.range_of_shot.describe().top, inplace=True)

In [24]:
#Making two seperate columns for latitute and longitude values
lat = []
lng = []

for row in df['lat/lng']:
    try:
        lat.append(float(row.split(',')[0]))
        lng.append(float(row.split(',')[1]))
    except:
        lat.append(np.NaN)
        lng.append(np.NaN)

df['latitude'] = lat
df['longitude'] = lng

In [25]:
#Deviation is low and mean seems like a good option
df.latitude.fillna(df.latitude.mean(), inplace=True)

In [26]:
#Deviation is low and mean seems like a good option
df.longitude.fillna(df.longitude.mean(),inplace=True)

In [27]:
## Transforming the feature 'home/away' into a categorical variable having values only 'home' and 'away' :

for i in range(df.shape[0]):
    if('@' in df.loc[i,'home/away']):
        df.loc[i,'home/away']='away'
        
    if('vs' in df.loc[i,'home/away']):
        df.loc[i,'home/away']='home'

In [28]:
for col in ['area_of_shot','range_of_shot','shot_basics','home/away']:
    dummy = pd.get_dummies(df[col],prefix=col)
    df=pd.concat([df,dummy],axis=1)

In [29]:
df.drop(['match_event_id','knockout_match','area_of_shot','range_of_shot','shot_basics','match_id','lat/lng','game_season','location_x','location_y','home/away'],axis=1,inplace=True)

In [30]:
train_data = df[df['is_goal'].isnull()!=True]
train_data.drop('shot_id_number',axis=1,inplace=True)
#test_data = df[df['is_goal'].isnull()==True]
#test_data.drop(['is_goal'],axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [31]:
test_data = pd.merge(df,sample_submission, on=['shot_id_number'], how='inner')
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 0 to 4999
Data columns (total 30 columns):
distance_of_shot                      5000 non-null float64
is_goal_x                             0 non-null float64
power_of_shot                         5000 non-null float64
remaining_min                         5000 non-null float64
remaining_sec                         5000 non-null float64
shot_id_number                        5000 non-null int64
type_of_shot                          5000 non-null int64
latitude                              5000 non-null float64
longitude                             5000 non-null float64
area_of_shot_Center(C)                5000 non-null uint8
area_of_shot_Left Side Center(LC)     5000 non-null uint8
area_of_shot_Left Side(L)             5000 non-null uint8
area_of_shot_Mid Ground(MG)           5000 non-null uint8
area_of_shot_Right Side Center(RC)    5000 non-null uint8
area_of_shot_Right Side(R)            5000 non-null uint8
range_of_sho

In [32]:
test_data.drop(['is_goal_x','shot_id_number','is_goal_y'],axis=1,inplace=True)

In [33]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24402 entries, 0 to 24401
Data columns (total 28 columns):
distance_of_shot                      24402 non-null float64
is_goal                               24402 non-null float64
power_of_shot                         24402 non-null float64
remaining_min                         24402 non-null float64
remaining_sec                         24402 non-null float64
type_of_shot                          24402 non-null int64
latitude                              24402 non-null float64
longitude                             24402 non-null float64
area_of_shot_Center(C)                24402 non-null uint8
area_of_shot_Left Side Center(LC)     24402 non-null uint8
area_of_shot_Left Side(L)             24402 non-null uint8
area_of_shot_Mid Ground(MG)           24402 non-null uint8
area_of_shot_Right Side Center(RC)    24402 non-null uint8
area_of_shot_Right Side(R)            24402 non-null uint8
range_of_shot_16-24 ft.               24402 non-nul

In [34]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5000 entries, 0 to 4999
Data columns (total 27 columns):
distance_of_shot                      5000 non-null float64
power_of_shot                         5000 non-null float64
remaining_min                         5000 non-null float64
remaining_sec                         5000 non-null float64
type_of_shot                          5000 non-null int64
latitude                              5000 non-null float64
longitude                             5000 non-null float64
area_of_shot_Center(C)                5000 non-null uint8
area_of_shot_Left Side Center(LC)     5000 non-null uint8
area_of_shot_Left Side(L)             5000 non-null uint8
area_of_shot_Mid Ground(MG)           5000 non-null uint8
area_of_shot_Right Side Center(RC)    5000 non-null uint8
area_of_shot_Right Side(R)            5000 non-null uint8
range_of_shot_16-24 ft.               5000 non-null uint8
range_of_shot_24+ ft.                 5000 non-null uint8
range_of_sh

In [35]:
test_data.head(10)

Unnamed: 0,distance_of_shot,power_of_shot,remaining_min,remaining_sec,type_of_shot,latitude,longitude,area_of_shot_Center(C),area_of_shot_Left Side Center(LC),area_of_shot_Left Side(L),...,range_of_shot_Less Than 8 ft.,shot_basics_Goal Area,shot_basics_Goal Line,shot_basics_Left Corner,shot_basics_Mid Ground Line,shot_basics_Mid Range,shot_basics_Penalty Spot,shot_basics_Right Corner,home/away_away,home/away_home
0,38.0,1.0,10.0,27.0,2,45.539131,-122.651648,0,0,0,...,0,0,0,0,0,1,0,0,1,0
1,22.0,3.0,8.0,5.0,3,45.539131,-122.651648,1,0,0,...,1,1,0,0,0,0,0,0,1,0
2,20.0,1.0,0.0,1.0,4,42.982923,-71.446094,1,0,0,...,1,1,0,0,0,0,0,0,0,1
3,20.0,3.0,10.0,46.0,4,42.982923,-71.446094,1,0,0,...,1,0,0,0,0,1,0,0,0,1
4,37.0,1.0,11.0,26.0,3,49.250068,-123.114646,0,0,0,...,0,0,0,0,0,1,0,0,1,0
5,40.0,1.0,10.0,58.0,3,49.250068,-123.114646,0,0,0,...,0,0,0,0,0,1,0,0,1,0
6,21.0,1.0,7.0,33.0,3,49.250068,-123.114646,1,0,0,...,1,1,0,0,0,0,0,0,1,0
7,21.0,1.0,5.0,58.0,3,49.250068,-123.114646,1,0,0,...,1,1,0,0,0,0,0,0,1,0
8,20.0,1.0,4.0,9.0,4,49.250068,-123.114646,1,0,0,...,1,1,0,0,0,0,0,0,1,0
9,36.0,2.0,5.0,33.0,1,49.250068,-123.114646,0,1,0,...,1,0,0,0,0,1,0,0,1,0


In [36]:
X,Y = train_data.drop(['is_goal'],axis=1),train_data['is_goal']  

#### Model

In [37]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.1) # 90% training and 10% test

In [38]:
#Scaling our data
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [39]:
X_train

array([[-0.58161727, -1.28249702, -0.82154513, ..., -0.11030676,
         0.96813195, -0.96813195],
       [ 0.95920426, -1.28249702,  1.77069585, ..., -0.11030676,
         0.96813195, -0.96813195],
       [ 1.16464713,  1.28059075, -0.53351836, ..., -0.11030676,
         0.96813195, -0.96813195],
       ...,
       [ 0.24015421, -1.28249702,  0.90661552, ..., -0.11030676,
        -1.03291706,  1.03291706],
       [ 0.54831852, -0.42813443,  0.33056197, ..., -0.11030676,
         0.96813195, -0.96813195],
       [ 1.26736857, -1.28249702,  0.33056197, ..., -0.11030676,
        -1.03291706,  1.03291706]])

In [40]:
classifier = LogisticRegression(penalty='l2',dual=False,random_state=0, max_iter=1000,tol=0.01)

In [41]:
classifier.fit(X_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='warn', tol=0.01, verbose=0,
                   warm_start=False)

In [42]:
#Predicting the results
y_pred_train=classifier.predict_proba(X_train)[:,0]
print ("Training score:",(1/(1+mean_absolute_error(y_train,y_pred_train))))
y_pred = classifier.predict_proba(X_test)[:,0]
print ("Training score:",(1/(1+mean_absolute_error(y_test,y_pred))))

Training score: 0.6542331673193869
Training score: 0.6540912740455426


In [55]:
y_pred

array([0.45979833, 0.44178911, 0.4594098 , ..., 0.45804999, 0.4607699 ,
       0.45882889])

In [56]:
print("Loss:",mean_absolute_error(y_test, y_pred))

Loss: 0.4931082325724794


In [57]:
print ("score:",(1/(1+mean_absolute_error(y_test,y_pred))))

score: 0.6697438123940271


In [35]:
y_pred=classifier.predict_proba(test_data)[:,-1]

In [36]:
result=pd.DataFrame({'shot_id_number':sample_submission['shot_id_number'],'is_goal':y_pred})

In [37]:
result.head(20)

Unnamed: 0,shot_id_number,is_goal
0,1,0.397004
1,8,0.5184
2,17,0.561068
3,20,0.553155
4,33,0.400129
5,34,0.468928
6,35,0.526991
7,36,0.52419
8,37,0.578941
9,38,0.440994
