In [2]:
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd
import math

In [3]:
def get_season_ratings(season):
    """Load the per-player statistics table for one season.

    Parameters
    ----------
    season : str
        Season label used in the file name, e.g. ``'2009-2010'``. The file
        read is ``rosters_with_stats_<season>.csv`` in the working directory.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV. The dtype mapping requests ``int32`` for the
        ``home_goals`` and ``away_goals`` columns.
    """
    # Raw string: '\s' inside a normal literal is an invalid escape sequence.
    # The regex swallows blanks around each comma, which needs the python engine.
    return pd.read_csv(
        'rosters_with_stats_' + season + '.csv',
        dtype={'home_goals': 'int32', 'away_goals': 'int32'},
        sep=r'\s*,\s*',
        engine='python',
    )

In [4]:
def get_match_results(season):
    """Load the match results table for one season.

    Parameters
    ----------
    season : str
        Short season code used in the file name, e.g. ``'0910'``. The file
        read is ``<cwd>/match result/match_results-<season>.csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV.
    """
    # os.path.join picks the right separator for the platform; the previous
    # hard-coded backslashes only resolved on Windows.
    season_file = os.path.join(
        os.getcwd(), 'match result', 'match_results-' + season + '.csv'
    )
    # Raw string: '\s' inside a normal literal is an invalid escape sequence.
    return pd.read_csv(season_file, sep=r'\s*,\s*', engine='python')

In [5]:
def get_match_roster(season):
    """Load the match line-up table for one season.

    Parameters
    ----------
    season : str
        Short season code used in the file name, e.g. ``'0910'``. The file
        read is ``match_lineup_<season>.csv`` in the working directory.

    Returns
    -------
    pandas.DataFrame
        The parsed CSV.
    """
    # Raw string: '\s' inside a normal literal is an invalid escape sequence.
    # The regex swallows blanks around each comma, which needs the python engine.
    return pd.read_csv(
        'match_lineup_' + season + '.csv',
        sep=r'\s*,\s*',
        engine='python',
    )

In [6]:
def _team_position_ratings(roster, ratings, row, side, team_id):
    """Summarise one team's 14-player match squad as five rating features.

    Parameters
    ----------
    roster : pandas.DataFrame
        Line-up table with columns ``<side>_p<k>_id``, ``<side>_p<k>_pos`` and
        ``<side>_p<k>_t`` for k = 1..14.
    ratings : pandas.DataFrame
        Season table with ``team-id``, ``player-id`` and ``rating`` columns.
    row : int
        Index label of the match row in ``roster``.
    side : str
        ``'h'`` for the home team or ``'a'`` for the away team.
    team_id
        Value matched against ``ratings['team-id']``.

    Returns
    -------
    list
        ``[roster_rating, goalkeeper, defence, midfield, offence]``. The first
        entry is the plain mean of the non-zero ratings in the squad; the
        others are time-weighted means per position group (NaN when the group
        has no weighted playing time).

    Raises
    ------
    IndexError
        If a listed player has no row in ``ratings`` for this team.
    """
    team_player_ids = ratings['player-id'][ratings['team-id'] == team_id]

    onfield_rating = np.zeros(14)
    group_names = ('Goalkeeper', 'Defender', 'Midfielder', 'Offense')
    weighted_ratings = {name: [] for name in group_names}
    weight_sums = {name: 0.0 for name in group_names}

    for slot in range(14):
        prefix = side + '_p' + str(slot + 1)
        player_id = roster[prefix + '_id'][row]
        if math.isnan(player_id) or player_id == 0:
            # Unregistered player: contributes to no feature.
            continue

        found = team_player_ids[team_player_ids == player_id]
        if found.empty:
            raise IndexError('player ' + str(player_id) + ' not found in ratings for team '
                             + str(team_id))
        rating = ratings['rating'][found.index[0]]
        onfield_rating[slot] = rating

        # Any position other than the three named ones counts as offensive.
        position = roster[prefix + '_pos'][row]
        group = position if position in group_names[:3] else 'Offense'

        # The weight is the absolute time value capped at 1.
        weight = min(np.abs(roster[prefix + '_t'][row]), 1)
        weighted_ratings[group].append(rating * weight)
        weight_sums[group] += weight

    summary = [np.mean(onfield_rating[onfield_rating != 0])]
    for name in group_names:
        total_weight = float(weight_sums[name])
        if total_weight == 0:
            # No weighted playing time in this group: report NaN instead of
            # raising ZeroDivisionError and aborting the whole season.
            summary.append(float('nan'))
        else:
            summary.append(float(np.sum(weighted_ratings[name])) / total_weight)
    return summary


def make_match_vectors(roster,ratings,results,features,verbose=True):
    """Makes the features for each match which can then be used for training.

    For every row of ``results`` one feature row is appended to ``features``:
    home roster/GK/defence/midfield/offence ratings, the same five values for
    the away team, and the label (1 home win, 0 draw, -1 away win), all
    converted to strings.

    Parameters
    ----------
    roster : pandas.DataFrame
        Line-up table; row ``i`` is used for the match in row ``i`` of ``results``.
    ratings : pandas.DataFrame
        Season table with ``team-id``, ``player-id`` and ``rating`` columns.
    results : pandas.DataFrame
        Table with ``match_id``, ``home_goals`` and ``away_goals`` columns.
    features : list
        List that is extended in place with the new rows.
    verbose : bool, optional
        Print each match id as it is processed (default True).

    Returns
    -------
    list
        The same ``features`` list that was passed in.
    """
    match_ids = results['match_id']
    goal_diff = results['home_goals'] - results['away_goals']
    home_team_ids = roster['h_id']
    away_team_ids = roster['a_id']

    for i in range(len(match_ids)):
        if verbose:
            print(match_ids[i])

        if goal_diff[i] < 0:
            label = -1
        elif goal_diff[i] == 0:
            label = 0
        else:
            label = 1

        home_values = _team_position_ratings(roster, ratings, i, 'h', home_team_ids[i])
        away_values = _team_position_ratings(roster, ratings, i, 'a', away_team_ids[i])
        features.append([str(value) for value in home_values + away_values + [label]])

    return features

In [7]:
def _fill_team_stat_block(vector, offset, roster, ratings, row, side, team_id, stat_columns):
    """Write one team's time-weighted player statistics into ``vector`` in place.

    The team block holds 20 player slots of ``len(stat_columns)`` values each:
    slot 0 goalkeeper, slots 1-6 defenders, slots 7-12 midfielders,
    slots 13-16 forwards and slots 17-19 the three substitutes (roster
    positions 12-14). Slots that are not filled keep their existing values.

    Parameters
    ----------
    vector : numpy.ndarray
        Feature vector that is modified in place.
    offset : int
        Index in ``vector`` where this team's block starts.
    roster, ratings : pandas.DataFrame
        Line-up table and season statistics table.
    row : int
        Index label of the match row in ``roster``.
    side : str
        ``'h'`` for the home team or ``'a'`` for the away team.
    team_id
        Value matched against ``ratings['team-id']``.
    stat_columns : list of str
        Columns of ``ratings`` copied for every player.

    Raises
    ------
    IndexError
        If a listed player has no row in ``ratings`` for this team.
    """
    team_player_ids = ratings['player-id'][ratings['team-id'] == team_id]
    n_stats = len(stat_columns)

    # position -> (first slot, number of slots) for outfield starters
    group_layout = {'Defender': (1, 6), 'Midfielder': (7, 6), 'Forward': (13, 4)}
    group_counts = {name: 0 for name in group_layout}

    for j in range(14):
        prefix = side + '_p' + str(j + 1)
        player_id = roster[prefix + '_id'][row]
        if math.isnan(player_id) or player_id == 0:
            # Unregistered player: the slot stays as it is (zeros).
            continue

        # Look the player up for starters AND substitutes; substitutes used to
        # reuse whatever index the previous starter lookup had left behind.
        found = team_player_ids[team_player_ids == player_id]
        if found.empty:
            raise IndexError('player ' + str(player_id) + ' not found in ratings for team '
                             + str(team_id))
        player_index = found.index[0]

        if j < 11:
            position = roster[prefix + '_pos'][row]
            if position == 'Goalkeeper':
                slot = 0
            elif position in group_layout:
                first_slot, capacity = group_layout[position]
                if group_counts[position] >= capacity:
                    # More players than the fixed layout can hold: skip rather
                    # than overwrite the next group's slots.
                    continue
                slot = first_slot + group_counts[position]
                group_counts[position] += 1
            else:
                # Starters with any other position label are not encoded.
                continue
        else:
            # Roster positions 12-14 map straight onto substitute slots 17-19,
            # so a missing starter can no longer push a substitute out of range.
            slot = 17 + (j - 11)

        # The weight is the absolute time value capped at 1.
        weight = min(np.abs(roster[prefix + '_t'][row]), 1)
        start = offset + slot * n_stats
        for k, column in enumerate(stat_columns):
            vector[start + k] = ratings[column][player_index] * weight


def make_extensive_match_vector(roster,ratings,results,features,verbose=True):
    """Build one 281-value feature vector per match from per-player statistics.

    Each vector is ``[home block (140), away block (140), label]``. A team
    block is 20 player slots x 7 statistics (``rating``, ``PS%``, ``AvgP``,
    ``SpG``, ``KeyP``, ``Blocks``, ``Inter``), each multiplied by the player's
    time weight. The label is 1 for a home win, 0 for a draw, -1 for an away win.

    Parameters
    ----------
    roster : pandas.DataFrame
        Line-up table; row ``i`` is used for the match in row ``i`` of ``results``.
    ratings : pandas.DataFrame
        Season table with ``team-id``, ``player-id`` and the seven stat columns.
    results : pandas.DataFrame
        Table with ``match_id``, ``home_goals`` and ``away_goals`` columns.
    features : list
        Ignored; kept so the signature matches ``make_match_vectors``. A new
        list is always returned.
    verbose : bool, optional
        Print each match id as it is processed (default True).

    Returns
    -------
    list of list of float
        One 281-element vector per row of ``results``.
    """
    stat_columns = ['rating','PS%','AvgP','SpG','KeyP','Blocks','Inter']
    team_width = 20 * len(stat_columns)

    match_ids = results['match_id']
    goal_diff = results['home_goals'] - results['away_goals']
    home_team_ids = roster['h_id']
    away_team_ids = roster['a_id']

    vectors = []
    for i in range(len(match_ids)):
        if verbose:
            print(match_ids[i])

        if goal_diff[i] < 0:
            label = -1
        elif goal_diff[i] == 0:
            label = 0
        else:
            label = 1

        feature_vector = np.zeros(2 * team_width + 1)
        feature_vector[-1] = label

        # Both teams use the same layout; the away block simply starts one
        # team width later (the old code shifted away defenders by one extra).
        _fill_team_stat_block(feature_vector, 0, roster, ratings, i, 'h',
                              home_team_ids[i], stat_columns)
        _fill_team_stat_block(feature_vector, team_width, roster, ratings, i, 'a',
                              away_team_ids[i], stat_columns)

        vectors.append(feature_vector.tolist())

    return vectors

The cell above builds, for each team, a vector of the form array(1x7 + 6x7 + 6x7 + 4x7) for the starting lineup (goalkeeper, defenders, midfielders, forwards); the substitute info (a 3x7 block per team) is appended at the end. All the statistics are weighted by the time played.

In [8]:
def main(match_vector_func=make_match_vectors):
    """Build the match feature CSV for all ten seasons.

    For every season the ratings, results and line-up tables are loaded and
    passed to ``match_vector_func``; the returned rows are written to disk.

    * With ``make_match_vectors`` the rows go to ``match_vectors.csv`` and the
      home vs. away roster rating of every match is added to a scatter plot,
      styled by result.
    * With any other function the rows go to ``match_vectors_extended.csv``
      under the 280-column per-player header plus ``result``.

    The output file is rewritten from scratch on every call, so re-running the
    cell does not duplicate the header or the rows.

    Parameters
    ----------
    match_vector_func : callable, optional
        Called as ``match_vector_func(roster, ratings, results, [])`` and
        expected to return an iterable of feature rows.
    """
    # label -> (marker, colour); same pairing the list lookups used to give
    # (label -1 previously relied on negative indexing into the lists).
    point_styles = {0: ('1', 'g'), 1: ('+', 'b'), -1: ('x', 'r')}
    seasons_for_ratings=['2009-2010','2010-2011','2011-2012','2012-2013','2013-2014','2014-2015','2015-2016'\
                         ,'2016-2017','2017-2018','2018-2019']
    seasons=['0910','1011','1112','1213','1314','1415','1516','1617','1718','1819']

    use_basic_vectors = match_vector_func==make_match_vectors
    if use_basic_vectors:
        # Header and rows now share one file; the header used to be written to
        # 'match_vectors_test.csv' while the data went to 'match_vectors.csv'.
        output_path = 'match_vectors.csv'
        header=['h_roster_rating','h_gk_rating','h_def_rating','h_mid_rating','h_off_rating',\
                'a_roster_rating','a_gk_rating','a_def_rating','a_mid_rating','a_off_rating','label']
        plt.figure()
        plt.xlabel('home roster rating')
        plt.ylabel('away roster rating')
    else:
        output_path = 'match_vectors_extended.csv'
        stat_suffixes = ['_rating','_pass_success','_passes_per_g','_shots_per_g',
                         '_key_passes_per_g','_blocks','_interceptions']
        header = [side + '_player' + str(i) + suffix
                  for side in ('home', 'away')
                  for i in range(20)
                  for suffix in stat_suffixes]
        header.append('result')

    with open(output_path, 'w') as filewrite:
        filewrite.write(', '.join(header)+'\n')
        for ratings_season, season in zip(seasons_for_ratings, seasons):
            ratings=get_season_ratings(ratings_season)
            results=get_match_results(season)
            roster=get_match_roster(season)
            # A fresh list per season: reusing one accumulating list made every
            # earlier season's rows get written again on each iteration.
            season_rows = match_vector_func(roster,ratings,results,[])
            for feature in season_rows:
                filewrite.write(', '.join(str(x) for x in feature)+'\n')

            if use_basic_vectors:
                # One scatter call per result class instead of one per match.
                for label, (marker, color) in point_styles.items():
                    points = [(float(f[0]), float(f[5])) for f in season_rows
                              if int(f[-1]) == label]
                    if points:
                        xs, ys = zip(*points)
                        plt.scatter(xs, ys, marker=marker, color=color)
        

In [9]:
# Entry point: generate the extended (per-player) match vectors.
if __name__ == '__main__':
    main(match_vector_func=make_extensive_match_vector)

317783
317786
317787
317788
317789
317790
317791
317792
317793
317794
317795
317796
317797
317798
317799
317800
317801
317802
317803
317804
317805
317806
317807
317808
317809
317810
317811
317812
317813
317814
317815
317816
317817
317818
317819
317820
317821
317822
317823
317824
317825
317826
317827
317828
317829
317830
317831
317832
317833
317834
317835
317836
317837
317838
317839
317840
317841
317842
317843
317844
317845
317882
317883
317884
317885
317886
317887
317888
317889
317890
317891
317892
317893
317894
317895
317896
317897
317898
317899
317900
317901
317902
317903
317904
317905
317906
317907
317908
317909
317910
317911
317912
317913
317914
317915
317916
317917
317918
317919
317920
317921
317922
317923
317924
317925
317926
317927
317928
317929
317930
317931
317932
317933
317934
317935
317936
317937
317938
317939
317940
317941
317942
318003
318004
318005
318006
318007
318008
318009
318010
318011
318012
318013
318014
318015
318016
318017
318018
318019
318020
318021
318022
318023

614135
614136
614137
614138
614139
614140
614141
614142
614147
614150
614153
614157
614161
614164
614168
614172
614173
614176
614188
614189
614190
614192
614197
614200
614203
614207
614210
614211
614220
614224
614226
614233
614265
614285
614307
614336
614361
614384
614970
614971
614972
614973
614974
614975
614976
614978
614979
614980
614981
614982
614983
614985
614986
614987
614988
614989
614990
614991
614992
614993
614994
614996
614997
614998
614999
615000
615002
615003
615004
615005
615006
615007
615008
615009
615010
615012
615013
615014
615015
615016
615017
615018
615019
615020
615021
615022
615023
615024
615025
615026
615027
615028
615029
615030
615031
615032
615034
615035
615036
615037
615038
615039
615040
615041
615042
615043
615044
615045
615046
615047
615048
615049
615050
615051
615052
615053
615054
615055
615056
615058
615059
615060
615061
615062
615064
615065
615066
615067
615068
615069
615070
615071
615072
615073
615074
615075
615078
615079
615080
615081
615082
615083
615084

959601
959602
959603
959604
959605
959607
959608
959609
959610
959611
959612
959613
959614
959615
959616
959617
959618
959619
959620
959621
959622
959623
959624
959625
959626
959627
959628
959629
959630
959631
959632
959633
959634
959635
959636
959637
959638
959639
959640
959641
959642
959643
959644
959645
959646
959647
959648
959649
959650
959651
959652
959653
959654
959655
959656
959657
959658
959659
959660
959661
959662
959663
959664
959665
959666
959667
959668
959669
959670
959671
959672
959673
959674
959675
959676
959677
959678
959679
959680
959681
959682
959683
959684
959685
959686
959687
959688
959689
959690
959691
959692
959693
959694
959695
959696
959697
959698
959699
959700
959701
959702
959703
959704
959705
959706
959707
959708
959709
959710
959711
959712
959713
959714
959715
959716
959717
959718
959719
959720
959721
959722
959723
959724
959725
959726
959727
959728
959729
959730
959731
959732
959733
959734
959735
959736
959737
959738
959739
959740
959741
959742
959743
959744

1190549
1190550
1190551
1190552
1190553
1190240
1190299
1190323
1190445
1190458
1284741
1284742
1284743
1284744
1284745
1284746
1284747
1284748
1284749
1284750
1284751
1284752
1284753
1284754
1284755
1284756
1284757
1284758
1284759
1284760
1284761
1284762
1284763
1284764
1284765
1284766
1284767
1284768
1284769
1284770
1284771
1284772
1284773
1284774
1284775
1284776
1284777
1284778
1284779
1284780
1284781
1284782
1284783
1284784
1284785
1284786
1284787
1284788
1284789
1284790
1284791
1284792
1284793
1284794
1284795
1284796
1284797
1284799
1284800
1284801
1284802
1284803
1284804
1284805
1284806
1284807
1284808
1284809
1284810
1284811
1284812
1284813
1284814
1284815
1284816
1284817
1284818
1284820
1284821
1284822
1284823
1284824
1284825
1284826
1284827
1284828
1284829
1284830
1284831
1284832
1284833
1284834
1284835
1284836
1284837
1284838
1284840
1284841
1284842
1284843
1284844
1284845
1284846
1284847
1284848
1284849
1284850
1284851
1284852
1284853
1284854
1284855
1284856
1284857
1284858


In [25]:
# Scratch check: a list produced by ndarray.tolist() can be edited in place
# and its items joined into one comma-separated line, as done for the CSV rows.
x = np.array([1, 2, 3, 4]).tolist()
x[-1] = 5
print(','.join(map(str, x)))

1,2,3,5
