In [1]:
#import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression

In [3]:
# The prepare_dataframes class computes one row of features per unique player (tour features were tried but are currently disabled)


In [16]:
class prepare_dataframes():
    """Load the raw CSV files and derive one row of features per player.

    Attributes:
        stats_df: match statistics; the columns ``ID1``, ``ID2``, ``BP_1`` and
            ``BPOF_1`` are read by :meth:`compute_stats`.
        players_df: player table indexed by ``ID_P`` whose first column is
            the rank (``RANK_P``); missing values are replaced by 0.
        player_ids: list of the unique ``ID_P`` values.
        players_features_df: result of :meth:`compute_players`.
    """

    def __init__(self, stats_path='./data/stats.csv', players_path='./data/players.csv'):
        """Read the CSV files and build ``players_features_df``.

        Args:
            stats_path: location of the match statistics CSV.
            players_path: location of the players CSV (``ID_P`` and ``RANK_P``
                columns are read).
        """
        # Load stats csv
        self.stats_df = pd.read_csv(stats_path)

        # Note: features from tour.csv (RANK_T, LATITUDE_T, LONGITUDE_T) were
        # tried in different combinations but did not improve model accuracy,
        # so that file is not loaded.

        # Load players csv
        players = pd.read_csv(players_path, usecols=["ID_P", "RANK_P"]).fillna(0)
        self.player_ids = players.ID_P.unique().tolist()
        # Index by player id for query purposes (no in-place mutation)
        self.players_df = players.set_index(['ID_P'])

        # One unique player per line with the features we tried for each player
        self.players_features_df = self.compute_players()

    def compute_stats(self, _id):
        """Average some ``stats_df`` features for one player.

        Rows where the player appears as ``ID1`` are counted as wins and rows
        where the player appears as ``ID2`` as losses.

        Args:
            _id: player identifier to look up in ``ID1`` / ``ID2``.

        Returns:
            ``(break_point, percentage)`` where ``break_point`` is the mean of
            ``BP_1 / BPOF_1`` over the player's wins, times 100, and
            ``percentage`` is wins * 100 / (wins + losses). Each value is 0
            when there is nothing to average. ``(0, 0)`` is returned when the
            statistics cannot be computed (missing column, non-numeric data).
        """
        try:
            wins = self.stats_df[self.stats_df['ID1'] == _id]
            losses = self.stats_df[self.stats_df['ID2'] == _id]
            n_wins = len(wins)
            n_matches = n_wins + len(losses)

            if n_wins != 0:
                # A zero denominator becomes NaN so the ratio is skipped by
                # sum() instead of turning the whole average into inf.
                opportunities = wins['BPOF_1'].where(wins['BPOF_1'] != 0)
                break_point = (wins['BP_1'] / opportunities).sum() * 100 / n_wins
            else:
                break_point = 0

            percentage = n_wins * 100 / n_matches if n_matches != 0 else 0
        except (KeyError, TypeError, ValueError, ZeroDivisionError):
            # Best-effort fallback: a player whose stats cannot be computed
            # gets neutral features rather than aborting the whole run.
            return (0, 0)
        # The success return lives outside any ``finally`` clause: a return
        # there would override the fallback above and fail on unbound names.
        return (break_point, percentage)

    def get_rank(self, _id):
        """Return the value of the first ``players_df`` column for ``_id``.

        If the id occurs more than once the first occurrence is used.

        Raises:
            KeyError: if ``_id`` is not in the ``players_df`` index.
        """
        # ``.loc[[...]]`` always yields a DataFrame, so duplicated ids do not
        # change the shape of the result.
        return self.players_df.loc[[_id]].iloc[0].tolist()[0]

    def compute_players(self):
        """Build the per-player feature table.

        Not all features will be used in the model.

        Returns:
            DataFrame with columns ``ID_P``, ``break_point``, ``percentage``
            and ``rank``, one row per entry of ``player_ids``.
        """
        players = []
        for _id in self.player_ids:
            break_point, percentage = self.compute_stats(_id)
            players.append([_id, break_point, percentage, self.get_rank(_id)])
        players_features_df = pd.DataFrame(
            players, columns=["ID_P", "break_point", "percentage", "rank"])

        return players_features_df
    
        

In [17]:
# The tennis_model class builds the match features and trains our logistic regression (LR) model


In [20]:
class tennis_model():
    """Logistic-regression model over per-match player feature differences.

    Attributes:
        input_df: training matches; columns ``ID1_G``, ``ID2_G``, ``ID_T_G``
            and ``ID_R_G`` are read.
        test_df: matches to predict; the same columns are read.
        d: ``prepare_dataframes`` instance providing ``players_features_df``.
        LR: fitted classifier returned by :meth:`train`.
    """

    def __init__(self, train_path="./data/train.csv", test_path="./data/test.csv"):
        """Load the train/test CSV files, build player features and train.

        Args:
            train_path: location of the training CSV.
            test_path: location of the CSV to predict on.
        """
        # load train data
        self.input_df = pd.read_csv(train_path)
        # load test data
        self.test_df = pd.read_csv(test_path)
        self.d = prepare_dataframes()
        self.LR = self.train()

    def _player_lookup(self):
        """Return ``{player id: (percentage, rank)}`` from the feature table.

        Built once per feature pass so every match row costs two dict lookups
        instead of two scans of the whole player table. When an id appears
        several times, the first row is kept.
        """
        players = self.d.players_features_df
        lookup = {}
        for player_id, percentage, rank in zip(players['ID_P'].tolist(),
                                               players['percentage'].tolist(),
                                               players['rank'].tolist()):
            lookup.setdefault(player_id, (percentage, rank))
        return lookup

    def _feature_row(self, row, lookup, swap=False):
        """Build the feature vector of one match.

        Args:
            row: match row providing ``ID1_G``, ``ID2_G``, ``ID_T_G``, ``ID_R_G``.
            lookup: mapping produced by :meth:`_player_lookup`.
            swap: when True the two players exchange positions.

        Returns:
            ``[first id, second id, ID_T_G, ID_R_G, percentage difference,
            rank difference]`` where differences are first minus second.

        Raises:
            IndexError: if one of the players has no entry in ``lookup``.
        """
        first, second = row['ID1_G'], row['ID2_G']
        if swap:
            first, second = second, first
        try:
            first_pct, first_rank = lookup[first]
            second_pct, second_rank = lookup[second]
        except KeyError as exc:
            # Same exception type as an empty-selection lookup, but it names
            # the offending player.
            raise IndexError(
                "player id %r not found in players_features_df" % (exc.args[0],)
            ) from exc
        return [first, second, row['ID_T_G'], row['ID_R_G'],
                first_pct - second_pct,
                int(first_rank) - int(second_rank)]

    def compute_train_feature(self):
        """Build the training features and labels from ``input_df``.

        Player positions are swapped on every other row (starting with the
        first one) so that both output values occur: a swapped row is
        labelled 2, an unswapped row is labelled 1.

        Returns:
            ``(features, output)`` as two parallel lists.
        """
        lookup = self._player_lookup()
        features, output = [], []
        for position, (_, row) in enumerate(self.input_df.iterrows()):
            swap = position % 2 == 0
            features.append(self._feature_row(row, lookup, swap=swap))
            output.append(2 if swap else 1)
        return (features, output)

    def compute_test_feature(self):
        """Build the feature vectors of ``test_df`` (players never swapped).

        Returns:
            List with one feature vector per row of ``test_df``.
        """
        lookup = self._player_lookup()
        return [self._feature_row(row, lookup)
                for _, row in self.test_df.iterrows()]

    def train(self):
        """Fit a logistic regression on an 80/20 split and print its report.

        Returns:
            The fitted ``LogisticRegression`` instance.
        """
        features, output = self.compute_train_feature()

        X_train, X_test, y_train, y_test = train_test_split(
            features, output, test_size=0.2, random_state=42)
        # Train classifier Logistic Regression
        LR = LogisticRegression(C=1.0, intercept_scaling=1,
                                dual=False, fit_intercept=True, penalty='l2', tol=0.0001)
        LR.fit(X_train, y_train)

        # Predict on the held-out part
        y_pred = LR.predict(X_test)

        print("Training Logistic Regression is done. The training report obtained is:")
        print(classification_report(y_test, y_pred))
        return LR

    def predict(self):
        """Predict ``test_df`` outcomes and save them to './data/result.csv'.

        The predictions are stored in a ``RESULT_G`` column of ``test_df``
        (at position 5, or last when the frame has fewer columns). Calling
        the method again replaces the column instead of failing.
        """
        features = self.compute_test_feature()
        X = pd.DataFrame(features)
        # Predict
        y_pred = self.LR.predict(X)

        # Drop a column left by a previous call so insert() does not raise
        # "already exists".
        if "RESULT_G" in self.test_df.columns:
            self.test_df = self.test_df.drop(columns="RESULT_G")
        position = min(5, len(self.test_df.columns))
        self.test_df.insert(position, "RESULT_G", y_pred)

        # Report success only once the file has actually been written
        self.test_df.to_csv("./data/result.csv")
        print("prediction results are saved in './data/result.csv'")

In [21]:
# Use a distinct instance name so the `tennis_model` class is not shadowed
# and this cell can be re-run without "object is not callable".
model = tennis_model()
model.predict()

Training Logistic Regression is done. The training report obtained is:
             precision    recall  f1-score   support

          1       0.70      0.70      0.70      7327
          2       0.70      0.69      0.70      7347

avg / total       0.70      0.70      0.70     14674

prediction results are saved in './data/result.csv'
