In [42]:
#importing library
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf

In [43]:
# Read the match results CSV from the Kaggle input directory.
RESULTS_CSV = '/kaggle/input/womens-international-football-results/results.csv'
df = pd.read_csv(RESULTS_CSV)
# Leave the frame as the last expression so the notebook renders a preview of it.
df

Unnamed: 0,date,home_team,away_team,home_score,away_score,tournament,city,country,neutral
0,1956-09-23,Germany,Netherlands,2,1,Friendly,Essen,Germany,False
1,1957-07-28,Germany,England,1,1,Friendly,Stuttgart,Germany,False
2,1957-10-13,Germany,Netherlands,2,0,Friendly,Berlin,Germany,False
3,1957-11-03,Netherlands,Austria,8,1,European Championship,Berlin,Germany,True
4,1957-11-03,Germany,England,0,4,European Championship,Berlin,Germany,False
...,...,...,...,...,...,...,...,...,...
9677,2024-08-04,Thailand,Taiwan,2,1,Friendly,Bangkok,Thailand,False
9678,2024-08-06,United States,Germany,1,0,Olympic Games,Décines-Charpieu,France,True
9679,2024-08-06,Brazil,Spain,4,2,Olympic Games,Marseille,France,True
9680,2024-08-09,Spain,Germany,0,1,Olympic Games,Décines-Charpieu,France,True


In [44]:
#getting preliminary information about the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9682 entries, 0 to 9681
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   date        9682 non-null   object
 1   home_team   9682 non-null   object
 2   away_team   9682 non-null   object
 3   home_score  9682 non-null   int64 
 4   away_score  9682 non-null   int64 
 5   tournament  9682 non-null   object
 6   city        9681 non-null   object
 7   country     9681 non-null   object
 8   neutral     9682 non-null   bool  
dtypes: bool(1), int64(2), object(6)
memory usage: 614.7+ KB


# Feature Engineering

In [45]:
# Parse `date` once as a real datetime using the strict `YYYY-MM-DD` format, so
# an impossible value (e.g. month 13) raises instead of being accepted, and read
# the calendar parts through the vectorised `.dt` accessor rather than running
# three Python-level string splits per row.
match_dates = pd.to_datetime(df['date'], format='%Y-%m-%d')
# `.astype(int)` keeps the same integer dtype the string-splitting version produced.
df['year'] = match_dates.dt.year.astype(int)
df['month'] = match_dates.dt.month.astype(int)
df['day'] = match_dates.dt.day.astype(int)

In [46]:
# The calendar parts now live in year/month/day, so the raw date string can go.
df = df.drop(columns=['date'])

In [47]:
# Preview the frame now that `date` has been replaced by year/month/day.
df

Unnamed: 0,home_team,away_team,home_score,away_score,tournament,city,country,neutral,year,month,day
0,Germany,Netherlands,2,1,Friendly,Essen,Germany,False,1956,9,23
1,Germany,England,1,1,Friendly,Stuttgart,Germany,False,1957,7,28
2,Germany,Netherlands,2,0,Friendly,Berlin,Germany,False,1957,10,13
3,Netherlands,Austria,8,1,European Championship,Berlin,Germany,True,1957,11,3
4,Germany,England,0,4,European Championship,Berlin,Germany,False,1957,11,3
...,...,...,...,...,...,...,...,...,...,...,...
9677,Thailand,Taiwan,2,1,Friendly,Bangkok,Thailand,False,2024,8,4
9678,United States,Germany,1,0,Olympic Games,Décines-Charpieu,France,True,2024,8,6
9679,Brazil,Spain,4,2,Olympic Games,Marseille,France,True,2024,8,6
9680,Spain,Germany,0,1,Olympic Games,Décines-Charpieu,France,True,2024,8,9


In [48]:
# Binary target: 1 when the home side outscored the visitors, 0 otherwise
# (a draw therefore counts as 0, the same as an away win).
df['home_victory'] = df['home_score'].gt(df['away_score']).astype(int)

In [49]:
# Drop the raw scores: the target was computed from them, so keeping them as
# features would leak the label into the model inputs.
df = df.drop(columns=['home_score', 'away_score'])

In [50]:
# Preview the frame with the scores replaced by the `home_victory` target.
df

Unnamed: 0,home_team,away_team,tournament,city,country,neutral,year,month,day,home_victory
0,Germany,Netherlands,Friendly,Essen,Germany,False,1956,9,23,1
1,Germany,England,Friendly,Stuttgart,Germany,False,1957,7,28,0
2,Germany,Netherlands,Friendly,Berlin,Germany,False,1957,10,13,1
3,Netherlands,Austria,European Championship,Berlin,Germany,True,1957,11,3,1
4,Germany,England,European Championship,Berlin,Germany,False,1957,11,3,0
...,...,...,...,...,...,...,...,...,...,...
9677,Thailand,Taiwan,Friendly,Bangkok,Thailand,False,2024,8,4,1
9678,United States,Germany,Olympic Games,Décines-Charpieu,France,True,2024,8,6,1
9679,Brazil,Spain,Olympic Games,Marseille,France,True,2024,8,6,1
9680,Spain,Germany,Olympic Games,Décines-Charpieu,France,True,2024,8,9,0


In [51]:
# Represent the boolean `neutral` flag as 0/1.
df = df.astype({'neutral': int})

In [52]:
#encoding
def onehot_encode(df, columns):
    """Return a copy of ``df`` with each listed column replaced by indicator columns.

    For every name in ``columns`` the column is removed and its
    ``pd.get_dummies`` indicators, named ``<column>_<value>``, are appended
    on the right. The caller's frame is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the columns to encode.
    columns : iterable of str
        Names of the columns to encode, processed in the given order.

    Returns
    -------
    pandas.DataFrame
        The untouched columns followed by the indicator columns.
    """
    encoded = df.copy()
    for name in columns:
        indicators = pd.get_dummies(encoded[name], prefix=name)
        # Remove the source column and attach its indicators in a single step.
        encoded = pd.concat([encoded.drop(name, axis=1), indicators], axis=1)
    return encoded


In [53]:
# Categorical columns that get expanded into one indicator column per distinct value.
categorical_columns = ['home_team', 'away_team', 'tournament', 'city', 'country']
df = onehot_encode(df, categorical_columns)

In [54]:
# Preview the one-hot encoded frame.
df

Unnamed: 0,neutral,year,month,day,home_victory,home_team_Afghanistan,home_team_Albania,home_team_Algeria,home_team_American Samoa,home_team_Andorra,...,country_United Arab Emirates,country_United States,country_United States Virgin Islands,country_Uruguay,country_Uzbekistan,country_Venezuela,country_Vietnam,country_Wales,country_Zambia,country_Zimbabwe
0,0,1956,9,23,1,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,0,1957,7,28,0,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,0,1957,10,13,1,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,1,1957,11,3,1,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,0,1957,11,3,0,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9677,0,2024,8,4,1,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9678,1,2024,8,6,1,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9679,1,2024,8,6,1,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9680,1,2024,8,9,0,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [58]:
# Cast the boolean indicator columns produced by one-hot encoding to 0/1 integers.
# `astype` returns a new object rather than modifying in place, so the result
# has to be assigned back; the previous loop called `df[column].astype('int')`
# and discarded the result, leaving every column as bool.
bool_columns = df.select_dtypes(include='bool').columns
df = df.astype({column: int for column in bool_columns})

In [59]:
# Preview the frame after the boolean-to-integer conversion step.
df

Unnamed: 0,neutral,year,month,day,home_victory,home_team_Afghanistan,home_team_Albania,home_team_Algeria,home_team_American Samoa,home_team_Andorra,...,country_United Arab Emirates,country_United States,country_United States Virgin Islands,country_Uruguay,country_Uzbekistan,country_Venezuela,country_Vietnam,country_Wales,country_Zambia,country_Zimbabwe
0,0,1956,9,23,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,1957,7,28,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,1957,10,13,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,1957,11,3,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,1957,11,3,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9677,0,2024,8,4,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9678,1,2024,8,6,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9679,1,2024,8,6,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9680,1,2024,8,9,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [60]:
# Separate the target column from the feature columns.

x = df.drop(columns=['home_victory'])
y = df['home_victory']

In [63]:
# Keep 70% of the rows for training and the rest for testing;
# the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.7,
    random_state=123,
)

In [67]:
# Standardise the features. The scaler is fitted on the training rows only and
# then applied to both splits, so no test-set statistics reach the model.
scaler = StandardScaler().fit(x_train)
train_scaled = scaler.transform(x_train)
test_scaled = scaler.transform(x_test)
# `transform` returns a plain array; wrap it back into frames that keep the
# original row index and column names.
x_train = pd.DataFrame(train_scaled, index=x_train.index, columns=x_train.columns)
x_test = pd.DataFrame(test_scaled, index=x_test.index, columns=x_test.columns)

In [69]:
# Sanity-check the split sizes: both feature frames first, then both targets.
for split in (x_train, x_test, y_train, y_test):
    print(split.shape)

(6777, 2420)
(2905, 2420)
(6777,)
(2905,)


In [81]:
# Training the model
#
# Small fully connected binary classifier: two 32-unit ReLU layers followed by
# a single sigmoid unit whose output is read as the probability of a home victory.

# Seed Python, NumPy and TensorFlow so weight initialisation and batch shuffling
# are repeatable between runs (same seed value as the train/test split).
tf.keras.utils.set_random_seed(123)

# Take the input width from the data instead of hard-coding 2420, so the cell
# keeps working if the one-hot encoding yields a different number of columns.
inputs = tf.keras.Input(shape=(x_train.shape[1],))
# The intermediate tensor is named `hidden` rather than `x`: rebinding `x` here
# would overwrite the feature DataFrame that the train/test split cell reads,
# which breaks that cell if it is run again afterwards.
hidden = tf.keras.layers.Dense(32, activation='relu')(inputs)
hidden = tf.keras.layers.Dense(32, activation='relu')(hidden)
outputs = tf.keras.layers.Dense(1, activation='sigmoid')(hidden)

model = tf.keras.Model(inputs=inputs, outputs=outputs)

model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy', tf.keras.metrics.AUC(name='auc')],
)
batch_size = 32
epochs = 50

# 20% of the training rows are held out for validation. ReduceLROnPlateau (with
# its default settings) lowers the learning rate when the monitored loss stalls.
# NOTE(review): the recorded log shows val_loss rising from the second epoch
# while the training loss keeps falling — consider early stopping or regularisation.
history = model.fit(
    x_train,
    y_train,
    validation_split=.20,
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[tf.keras.callbacks.ReduceLROnPlateau()],
)

Epoch 1/50
[1m170/170[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.5703 - auc: 0.5699 - loss: 0.7092 - val_accuracy: 0.6202 - val_auc: 0.6573 - val_loss: 0.6630 - learning_rate: 0.0010
Epoch 2/50
[1m170/170[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.7773 - auc: 0.8652 - loss: 0.4965 - val_accuracy: 0.6372 - val_auc: 0.6820 - val_loss: 0.7042 - learning_rate: 0.0010
Epoch 3/50
[1m170/170[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8404 - auc: 0.9216 - loss: 0.3732 - val_accuracy: 0.6593 - val_auc: 0.7027 - val_loss: 0.7510 - learning_rate: 0.0010
Epoch 4/50
[1m170/170[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.8853 - auc: 0.9568 - loss: 0.2855 - val_accuracy: 0.6571 - val_auc: 0.7000 - val_loss: 0.8534 - learning_rate: 0.0010
Epoch 5/50
[1m170/170[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - accuracy: 0.9071 - auc: 0.9709 - loss: 0

In [82]:
# Plot training vs. validation loss per epoch.
# `history.history` is passed in wide form (several y columns, no x), so Plotly
# Express names the x axis "index" and the y axis "value". Label overrides must
# use those keys; the previous 'x'/'y' keys matched nothing and were ignored.
fig = px.line(
    history.history,
    y=['loss', 'val_loss'],
    labels={'index': 'Epochs', 'value': 'Loss'},
    title='Loss Over Time',
)
fig.show()

In [83]:
# Score the trained model on the held-out test set; the returned list holds the
# loss followed by the compiled metrics (accuracy, then AUC).
model.evaluate(x_test,y_test)

[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6695 - auc: 0.7218 - loss: 1.6387


[1.6128195524215698, 0.6695352792739868, 0.7193713188171387]

In [78]:
# Show the true test labels (1 = home victory, 0 = draw or away win).
y_test

4784    1
4340    1
1449    0
6072    1
4552    0
       ..
8115    1
1644    1
2453    0
7837    0
3354    0
Name: home_victory, Length: 2905, dtype: int64

In [79]:
# Model outputs for the test set: one sigmoid value per row, read as the
# predicted probability of a home victory.
model.predict(x_test)

[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


array([[9.9982631e-01],
       [9.9979061e-01],
       [5.5925134e-06],
       ...,
       [5.9394218e-04],
       [5.8108875e-11],
       [9.9999458e-01]], dtype=float32)