In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
pd.set_option('mode.chained_assignment', None)

In [2]:
match_statistics = pd.read_csv('All Matches.csv') 
match_scores=match_statistics[['HomeTeamName','AwayTeamName','HomeTeamGoals','AwayTeamGoals']]
match_scores.loc[match_scores['HomeTeamGoals']>match_scores['AwayTeamGoals'], 'results']= 'win'
match_scores.loc[match_scores['HomeTeamGoals']<match_scores['AwayTeamGoals'], 'results']= 'lose'
match_scores.loc[match_scores['HomeTeamGoals']==match_scores['AwayTeamGoals'], 'results']= 'draw'
match_scores.head()

Unnamed: 0,HomeTeamName,AwayTeamName,HomeTeamGoals,AwayTeamGoals,results
0,France,Yugoslavia,4,5,lose
1,Czechoslovakia,Soviet Union,0,3,lose
2,Czechoslovakia,France,2,0,win
3,Soviet Union,Yugoslavia,2,1,win
4,Spain,Hungary,2,1,win


In [3]:
general_statistics=pd.read_csv('Participated Teams General Statistics.csv',index_col=0)
national_appearance=pd.read_csv('National Teams Appearance.csv',index_col=0)
national_appearance=national_appearance[['Appearances','Record streak','Active streak']]
team_status=pd.concat([general_statistics,national_appearance],axis=1)
team_status=team_status[:-2]
team_status.head()

Unnamed: 0,Participations,Played,Win,Draw,Loss,Goal_For,Goal_Against,Goal_Difference,Points,Points/match,Appearances,Record streak,Active streak
Germany,12.0,49.0,26.0,12.0,11.0,72.0,48.0,24.0,90.0,1.84,13,13,13
France,9.0,39.0,20.0,9.0,10.0,62.0,44.0,18.0,69.0,1.77,10,8,8
Spain,10.0,40.0,19.0,11.0,10.0,55.0,36.0,19.0,68.0,1.7,11,7,7
Italy,9.0,38.0,16.0,16.0,6.0,39.0,27.0,12.0,64.0,1.68,10,7,7
Portugal,7.0,35.0,18.0,9.0,8.0,49.0,31.0,18.0,63.0,1.8,8,7,7


In [4]:
home_team_df=team_status.reindex(match_scores['HomeTeamName'])
away_team_df=team_status.reindex(match_scores['AwayTeamName'])
home_away_df=pd.concat([home_team_df.reset_index(),away_team_df.reset_index()],axis=1)
home_away_df=home_away_df.dropna()
home_away_df=home_away_df.reset_index(drop=True)
home_away_df.head()

Unnamed: 0,HomeTeamName,Participations,Played,Win,Draw,Loss,Goal_For,Goal_Against,Goal_Difference,Points,...,Draw.1,Loss.1,Goal_For.1,Goal_Against.1,Goal_Difference.1,Points.1,Points/match,Appearances,Record streak,Active streak
0,Spain,10.0,40.0,19.0,11.0,10.0,55.0,36.0,19.0,68.0,...,2.0,4.0,11.0,14.0,-3.0,8.0,1.0,4.0,2.0,2.0
1,Hungary,3.0,8.0,2.0,2.0,4.0,11.0,14.0,-3.0,8.0,...,6.0,14.0,30.0,43.0,-13.0,27.0,1.0,9.0,6.0,1.0
2,Hungary,3.0,8.0,2.0,2.0,4.0,11.0,14.0,-3.0,8.0,...,2.0,8.0,22.0,25.0,-3.0,23.0,1.35,6.0,2.0,2.0
3,Netherlands,9.0,35.0,17.0,8.0,10.0,57.0,37.0,20.0,59.0,...,3.0,8.0,14.0,20.0,-6.0,18.0,1.13,4.0,3.0,0.0
4,Belgium,5.0,17.0,7.0,2.0,8.0,22.0,25.0,-3.0,23.0,...,11.0,10.0,40.0,35.0,5.0,41.0,1.32,10.0,5.0,3.0


In [5]:
euro_cup_data=pd.concat([home_away_df,match_scores.iloc[:,-1]],axis=1).drop(['HomeTeamName','AwayTeamName'],axis=1)
euro_cup_data=euro_cup_data.dropna()
euro_cup_data.head()

Unnamed: 0,Participations,Played,Win,Draw,Loss,Goal_For,Goal_Against,Goal_Difference,Points,Points/match,...,Loss.1,Goal_For.1,Goal_Against.1,Goal_Difference.1,Points.1,Points/match.1,Appearances,Record streak,Active streak,results
0,10.0,40.0,19.0,11.0,10.0,55.0,36.0,19.0,68.0,1.7,...,4.0,11.0,14.0,-3.0,8.0,1.0,4.0,2.0,2.0,lose
1,3.0,8.0,2.0,2.0,4.0,11.0,14.0,-3.0,8.0,1.0,...,14.0,30.0,43.0,-13.0,27.0,1.0,9.0,6.0,1.0,lose
2,3.0,8.0,2.0,2.0,4.0,11.0,14.0,-3.0,8.0,1.0,...,8.0,22.0,25.0,-3.0,23.0,1.35,6.0,2.0,2.0,win
3,9.0,35.0,17.0,8.0,10.0,57.0,37.0,20.0,59.0,1.69,...,8.0,14.0,20.0,-6.0,18.0,1.13,4.0,3.0,0.0,win
4,5.0,17.0,7.0,2.0,8.0,22.0,25.0,-3.0,23.0,1.35,...,10.0,40.0,35.0,5.0,41.0,1.32,10.0,5.0,3.0,win


In [6]:
scores_temp=euro_cup_data.iloc[:,:-1]
euro_cup_normal=(scores_temp - scores_temp.min()) / (scores_temp.max() - scores_temp.min())
euro_cup_normal.head()

Unnamed: 0,Participations,Played,Win,Draw,Loss,Goal_For,Goal_Against,Goal_Difference,Points,Points/match,...,Draw.1,Loss.1,Goal_For.1,Goal_Against.1,Goal_Difference.1,Points.1,Points/match.1,Appearances,Record streak,Active streak
0,0.818182,0.804348,0.730769,0.6875,0.692308,0.760563,0.733333,0.864865,0.752809,0.820359,...,0.125,0.230769,0.140845,0.276596,0.27027,0.078652,0.401198,0.25,0.083333,0.153846
1,0.181818,0.108696,0.076923,0.125,0.230769,0.140845,0.244444,0.27027,0.078652,0.401198,...,0.375,1.0,0.408451,0.893617,0.0,0.292135,0.401198,0.666667,0.416667,0.076923
2,0.181818,0.108696,0.076923,0.125,0.230769,0.140845,0.244444,0.27027,0.078652,0.401198,...,0.125,0.538462,0.295775,0.510638,0.27027,0.247191,0.610778,0.416667,0.083333,0.153846
3,0.727273,0.695652,0.653846,0.5,0.692308,0.788732,0.755556,0.891892,0.651685,0.814371,...,0.1875,0.538462,0.183099,0.404255,0.189189,0.191011,0.479042,0.25,0.166667,0.0
4,0.363636,0.304348,0.269231,0.125,0.538462,0.295775,0.488889,0.27027,0.247191,0.610778,...,0.6875,0.692308,0.549296,0.723404,0.486486,0.449438,0.592814,0.75,0.333333,0.230769


In [17]:
x=euro_cup_normal
y=euro_cup_data['results']
team1='Italy'
team2='Austria'
predict=pd.concat([home_team_df.loc[team1].iloc[0],home_team_df.loc[team2].iloc[0]])
predict_normal=(predict - predict.min()) / (predict.max() - predict.min())

In [15]:
model_1=MLPClassifier(max_iter=3000)
model_1.fit(x,y)
prob_model_1=model_1.predict_proba(np.atleast_2d(predict_normal))
#print('比赛结果：%s' % (model_1.predict(np.atleast_2d(predict))[0]))#模型预测
print('赢球概率：%.3f' % (prob_model_1[0][0]))
print('平局概率：%.3f' % (prob_model_1[0][1]))
print('输球概率：%.3f' % (prob_model_1[0][2]))
print('交叉验证正确率：%.3f' % (np.mean(cross_val_score(model_1,x,y,cv=5))))#5折交叉验证

赢球概率：0.329
平局概率：0.362
输球概率：0.309
交叉验证正确率：0.389


In [18]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=42)#前70%划分为训练集
logreg = LogisticRegression(max_iter=3000)
logreg.fit(x_train, y_train)
score = logreg.score(x_train, y_train)#计算正确率
score2 = logreg.score(x_test, y_test)

print('训练正确率：%.3f' % (score))
print('测试正确率：%.3f' % (score2))

prob=logreg.predict_proba(np.atleast_2d(predict_normal))#计算各个标签的概率
print('%s vs' %(team1),'%s' % (team2))
print('赢球概率：%.3f' % (prob[0][0]))
print('平局概率：%.3f' % (prob[0][1]))
print('输球概率：%.3f' % (prob[0][2]))

训练正确率：0.485
测试正确率：0.528
Italy vs Austria
赢球概率：0.228
平局概率：0.326
输球概率：0.445
