# TTTM - Table Tennis Score Prediction - Israel
## In this project we will try to predict the scores of the matches and the forms that are filled in by each team


### Initial imports

In [1]:
import re
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

### Open csv 

In [2]:
def read_csv(filename):
    """Load a CSV file into a DataFrame.

    Parameters
    ----------
    filename : str or os.PathLike
        Path of the CSV file. A relative path is resolved against the current
        working directory; an absolute path is used as given.

    Returns
    -------
    pandas.DataFrame
        The parsed file, exactly as returned by ``pandas.read_csv``.

    Raises
    ------
    FileNotFoundError
        If the file does not exist (raised by ``pandas.read_csv``).
    """
    # Joining with Path keeps relative names relative to the working directory
    # while leaving absolute paths intact (plain './' + filename broke them).
    return pd.read_csv(Path('.') / filename)
    

### Dataset - links to every player's personal profile page

#### Crawling 'http://www.tttm.co.il' to prepare the first dataset, which will contain each player's personal profile link

In [3]:
# Module-level state shared with the following cells:
#   players_hrefs  - profile links (relative URLs) of the ranked players
#   top_players_df - one row per player; None until loaded or crawled
players_hrefs = []
top_players_df = None


def get_players_df():
    """Crawl the men's ranking pages and build the players table.

    Visits ranking pages 1-3, collects the first link found in every table
    row, then opens each linked profile page and extracts name, club, rank,
    category, points and id. The result is stored in the module-level
    ``top_players_df``, the collected links in the module-level
    ``players_hrefs``, and the table is written to ``players_table.csv``
    (UTF-8 with BOM, index included).

    Returns
    -------
    pandas.DataFrame
        The players table with columns
        ``name, club, rank, category, points, id, link``.

    Raises
    ------
    AttributeError, IndexError, ValueError
        If a ranking or profile page does not have the expected structure.
    requests.RequestException
        If a page cannot be fetched.
    """
    # Without this declaration the assignments below would only create locals
    # and the notebook-level variables would keep their initial values.
    global players_hrefs, top_players_df

    print("Start")
    hrefs = []
    for i in range(1,4):
        url = f'http://www.tttm.co.il/rk/MS-{i}/%D7%98%D7%A0%D7%99%D7%A1-%D7%A9%D7%95%D7%9C%D7%97%D7%9F-%D7%93%D7%99%D7%A8%D7%95%D7%92-%D7%92%D7%91%D7%A8%D7%99%D7%9D'
        res = requests.get(url)
        soup = BeautifulSoup(res.content,'html.parser')
        for player in soup.find('table').find_all('tr'):
            a = player.find('a')
            # Rows without a link, or whose link has no href, carry no player
            # (presumably header rows) and are skipped.
            href = a.get('href') if a is not None else None
            if href is None:
                continue
            hrefs.append(href)

    columns = ["name", "club", "rank", "category", "points", "id", "link"]
    records = []

    for h in hrefs:
        res = requests.get('http://tttm.co.il'+h)
        soup = BeautifulSoup(res.content,'html.parser')
        name = soup.find('div',attrs={"class":"playerName"}).text
        player_presentation = soup.find('div',attrs={"class":"playerPresentation"})
        club = player_presentation.find('a').text
        rank = int(player_presentation.find('table').find('b').text)
        # The first two <span> elements hold the category and the points.
        cat_points = player_presentation.find_all('span')
        category = cat_points[0].find('b').text
        points = float(cat_points[1].find('b').text)
        # The first <b> inside the presentation block is used as the player id.
        player_id = player_presentation.find('b').text.strip('\n').strip()
        print(name)
        records.append({
            "name": name,
            "club": club,
            "rank": rank,
            "category": category,
            "points": points,
            "id": player_id,
            "link": h,
        })

    # Explicit columns keep the schema stable even when nothing was crawled.
    players_df = pd.DataFrame(records, columns=columns)
    players_df['name'] = players_df['name'].str.strip('\n ')
    players_df['club'] = players_df['club'].str.strip('\n ')
    players_df.to_csv('players_table.csv' , encoding = "utf-8-sig")

    # Rebind (instead of appending to) the globals so that calling the function
    # twice does not accumulate duplicate links.
    players_hrefs = hrefs
    top_players_df = players_df
    return top_players_df
    
    

## Trying to open players_table.csv. If that doesn't work, we will use the function above


In [4]:

# Prefer the cached CSV; fall back to crawling only when the cache is missing
# or unusable. Catching specific errors (instead of a bare except) keeps a
# kernel interrupt from accidentally triggering the long crawl.
try:
    top_players_df = read_csv('./players_table.csv')
    players_hrefs = top_players_df['link']
except (OSError, KeyError, ValueError):
    # get_players_df() writes players_table.csv; loading that file afterwards
    # gives both paths the same frame (same columns and dtypes) and makes sure
    # top_players_df is actually populated after the fallback.
    get_players_df()
    top_players_df = read_csv('./players_table.csv')
    players_hrefs = top_players_df['link']


In [5]:
# The CSV is written with its index, which comes back as an 'Unnamed: 0'
# column on reload. Drop it when present; errors='ignore' makes the cell safe
# to re-run, and the None guard covers the case where no table was loaded.
if top_players_df is not None:
    top_players_df = top_players_df.drop(columns='Unnamed: 0', errors='ignore')

In [6]:
# Display the players table (one row per crawled player profile).
top_players_df

Unnamed: 0,name,club,rank,category,points,id,link
0,יונתן שוסטרמן,מ. בני הרצליה,1,S,1778.3,745,/p/745/יונתן-שוסטרמן
1,מיכאל טאובר,עירוני גבעתיים,2,S,1697.3,853,/p/853/מיכאל-טאובר
2,טל ישראלי,הפועל חיפה,3,E19,1666.4,651,/p/651/טל-ישראלי
3,אביב בן ארי,מכבי זאב זכרון יעקב,4,S,1619.7,524,/p/524/אביב-בן-ארי
4,יניב שרון,הפועל לוד,5,S40,1574.1,882,/p/882/יניב-שרון
...,...,...,...,...,...,...,...
295,ויאצ'סלב גנין,בית''ר אשדוד,296,S40,681.1,4673,/p/4673/ויאצ-סלב-גנין
296,רונן גבע,הפועל יוקנעם,297,S40,681.0,4441,/p/4441/רונן-גבע
297,אולג אוישר,הפועל טנ''ש באר שבע,298,S50,679.8,1409,/p/1409/אולג-אוישר
298,יקיר טאיבי,טניס שולחן חיפה,299,S,675.9,6923,/p/6923/יקיר-טאיבי


### Let's implement a function that returns a single player's row
#### get_player_row(player_name) -> return player's row

In [7]:
def get_player_row(player_name):
    """Return the rows of ``top_players_df`` whose ``name`` equals ``player_name``.

    Parameters
    ----------
    player_name : str
        Exact player name to look up.

    Returns
    -------
    pandas.DataFrame or None
        The matching rows (an empty DataFrame when no player has that name),
        or ``None`` when the players table is not loaded or has no ``name``
        column.
    """
    try:
        return top_players_df[top_players_df['name'] == player_name]
    except (KeyError, TypeError):
        # KeyError: the table has no 'name' column.
        # TypeError: top_players_df is still None (table never loaded).
        return None
        





In [8]:
# Sanity check: look up one player by exact name and show the name stored in
# the first matching row.
yoni_row = get_player_row('יוני ירמיהו')
yoni_row['name'].values[0]

'יוני ירמיהו'

## After saving the first dataframe as csv we need to acquire more data

### Creating a new dataset of all the games played by the players whose data we crawled earlier

In [9]:
# Column-oriented accumulator for the games table: every key is a column name
# mapped to its own (initially empty) list of values. The order below is the
# column order of the resulting DataFrame.
games_to_df = {
    column: []
    for column in (
        "match_id", "match_type", "date",
        "p1_id", "p1_name", "p1_club", "p1_rank",
        "p1_sets", "p1_home", "p1_points_gained",
        "p2_id", "p2_name", "p2_club", "p2_rank",
        "p2_sets", "p2_home", "p2_points_gained",
        "winner_id",
    )
}

## Creating the matches dataset

### We will now crawl every player's page and create a dataset of all the games that each player played


In [10]:
# Crawl configuration (same values as the previously hard-coded literals).
BASE_URL = 'http://tttm.co.il'
MAX_PLAYERS = 220           # how many entries of players_hrefs are visited
MAX_PAGES_PER_PLAYER = 9    # match-history pages followed per player

# For every player: open the profile page, follow its match-history pages and
# append one record per parsable match row to the games_to_df column lists.
# The crawled player is always recorded as p1 and the opponent as p2.
for h in range(min(MAX_PLAYERS, len(players_hrefs))):
    try:
        res = requests.get(BASE_URL + players_hrefs[h])
        soup = BeautifulSoup(res.content, 'html.parser')
        # Pagination links of the match history. A profile without this block
        # raises here, so the whole player is skipped.
        pages_arr = soup.find('div', attrs={"class": "rankpages"}).find_all('a')
        player_presentation = soup.find('div', attrs={"class": "playerPresentation"})
        player_details = {
            'name': player_presentation.find('div', attrs={"class": "playerName"}).text.strip('\n').strip(),
            'id': player_presentation.find('b').text.strip('\n').strip(),
            'club': player_presentation.find('a').text.strip('\n').strip(),
        }
        print("Player number: ",h," ",player_details)

        for page_link in pages_arr[:MAX_PAGES_PER_PLAYER]:
            res = requests.get(BASE_URL + page_link['href'])
            soup = BeautifulSoup(res.content, 'html.parser')
            table = soup.find('table', attrs={"class": "lstMatchs"})
            for tr in table.find_all('tr'):
                try:
                    # Rows without an "fsClub" link raise here and are skipped.
                    match_type = tr.find('a', attrs={"class": "fsClub"}).text.strip('\r\n ')
                    td_list = tr.find_all('td')
                    match_id = td_list[0].text
                    date_time_obj = datetime.strptime(td_list[2].text, '%d/%m/%Y')

                    # Rating points: column 10 holds p1's change and column 9 a
                    # multiplier ("delta"). As coded, a p1 loss gives p2
                    # |p1| * delta and a p1 gain costs p2 p1 / delta. Any
                    # parsing problem resets both values to 0.
                    try:
                        p1_points_gained = float(td_list[10].find('span').text)
                        delta = float(td_list[9].find('b').text)
                        if p1_points_gained < 0:
                            p2_points_gained = abs(p1_points_gained) * delta
                        else:
                            p2_points_gained = -(p1_points_gained / delta)
                    except Exception:
                        p1_points_gained = 0
                        p2_points_gained = 0

                    p1_id = player_details['id']
                    p1_name = player_details['name']
                    p1_club = player_details['club']
                    # First decimal number found in the span text of column 1.
                    p1_rank = float(re.findall(r"\d+\.\d+", td_list[1].find('span').text)[0])

                    p2 = td_list[5].find_all('a')
                    p2_name = p2[0].text.strip(' \r\n')
                    p2_club = p2[1].text
                    p2_rank = float(td_list[6].text)
                    # Opponents that are not in top_players_df raise IndexError
                    # here, so only games between crawled players are kept.
                    # NOTE(review): p1_id is the string scraped from the page
                    # while p2_id takes the dtype of top_players_df['id'], so
                    # winner_id can mix both types - confirm downstream use.
                    p2_id = top_players_df[top_players_df['name']==p2_name]["id"].values[0]

                    # As coded, the first number of the "x-y" score is the
                    # opponent's set count and the second the crawled player's.
                    sets = td_list[8].text.split('-')
                    p2_sets = int(sets[0])
                    p1_sets = int(sets[1])

                    # Home/away flags stay NaN unless the opponent's club name
                    # appears in one side of the "teamA-teamB" fixture text.
                    p1_home = np.nan
                    p2_home = np.nan
                    try:
                        teams = td_list[3].find('i').text.split('-')
                        if p2_club in teams[0]:
                            p2_home = 1
                            p1_home = 0
                        elif p2_club in teams[1]:
                            p1_home = 1
                            p2_home = 0
                    except Exception:
                        pass

                    # p2 is recorded as winner unless p1 won strictly more sets.
                    winner = p1_id if p1_sets > p2_sets else p2_id

                    # Append all columns only after the whole row parsed, so
                    # every list in games_to_df keeps the same length.
                    record = {
                        'match_id': match_id,
                        'match_type': match_type,
                        'date': date_time_obj,
                        'p1_id': p1_id,
                        'p1_name': p1_name,
                        'p1_club': p1_club,
                        'p1_rank': p1_rank,
                        'p1_sets': p1_sets,
                        'p1_home': p1_home,
                        'p1_points_gained': p1_points_gained,
                        'p2_id': p2_id,
                        'p2_name': p2_name,
                        'p2_club': p2_club,
                        'p2_rank': p2_rank,
                        'p2_sets': p2_sets,
                        'p2_home': p2_home,
                        'p2_points_gained': p2_points_gained,
                        'winner_id': winner,
                    }
                    for column, value in record.items():
                        games_to_df[column].append(value)
                except Exception:
                    # Best effort: skip rows that cannot be parsed. Exception
                    # (not a bare except) keeps the crawl interruptible.
                    continue
    except Exception as err:
        # Best effort per player: report the problem and move on.
        print("Stopped crawling player number: ", h, " - ", repr(err))
        continue

                
    

Player number:  0   {'name': 'יונתן שוסטרמן', 'id': '745', 'club': 'מ. בני הרצליה'}
Player number:  1   {'name': 'מיכאל טאובר', 'id': '853', 'club': 'עירוני גבעתיים'}
Player number:  2   {'name': 'טל ישראלי', 'id': '651', 'club': 'הפועל חיפה'}
Player number:  3   {'name': 'אביב בן ארי', 'id': '524', 'club': 'מכבי זאב זכרון יעקב'}
Player number:  4   {'name': 'יניב שרון', 'id': '882', 'club': 'הפועל לוד'}
Player number:  5   {'name': 'רון דוידוביץ', 'id': '881', 'club': 'הפועל עירוני נוף הגליל'}
Player number:  6   {'name': 'עמרי בן ארי', 'id': '894', 'club': 'עירוני גבעתיים'}
Player number:  7   {'name': 'מתן סימון', 'id': '341', 'club': 'עירוני גבעתיים'}
Player number:  8   {'name': 'איתי אביבי', 'id': '2284', 'club': 'הפועל חיפה'}
Player number:  9   {'name': 'עמית גורן', 'id': '676', 'club': 'הפועל עירוני נוף הגליל'}
Player number:  10   {'name': 'אור בשן', 'id': '865', 'club': 'מ. בני הרצליה'}
Player number:  11   {'name': 'איסק אברמוב', 'id': '874', 'club': "בית''ר ראשון לציון"}
P

Player number:  104   {'name': 'שימי אסרף', 'id': '926', 'club': 'הפועל חיפה'}
Player number:  105   {'name': 'אדר שקד', 'id': '857', 'club': 'הפועל לוד'}
Player number:  106   {'name': 'פיליפ קושנר', 'id': '2387', 'club': 'מכבי גבירול באר שבע'}
Player number:  107   {'name': 'דוד גולן', 'id': '922', 'club': 'מכבי הישגים ראשון לציון'}
Player number:  108   {'name': 'דניאל זהר', 'id': '2028', 'club': 'אופק קריית אונו'}
Player number:  109   {'name': 'רביד ברנדמן', 'id': '3833', 'club': 'אופק קריית אונו'}
Player number:  110   {'name': 'יניב בלנק', 'id': '929', 'club': "בית''ר רמלה"}
Player number:  111   {'name': 'דני ליבשיץ', 'id': '293', 'club': 'הפועל ירושלים'}
Player number:  112   {'name': 'איגור סלבין', 'id': '1907', 'club': 'הפועל אשקלון'}
Player number:  113   {'name': 'דן זוילי', 'id': '2229', 'club': 'הפועל לוד'}
Player number:  114   {'name': 'יוסי רוזנבלט', 'id': '998', 'club': 'הפועל אשדוד'}
Player number:  115   {'name': 'איגור גולדברג', 'id': '1359', 'club': 'הפועל בת ים'

Player number:  206   {'name': 'ערן ניר', 'id': '7025', 'club': 'מכבי זאב זכרון יעקב'}
Player number:  207   {'name': 'דיאגו שיקינסקי', 'id': '1526', 'club': 'הפועל עירוני נוף הגליל'}
Player number:  210   {'name': 'שי אוחיון', 'id': '5856', 'club': 'טנש ירושלים'}
Player number:  211   {'name': 'עידו טל', 'id': '3035', 'club': 'מכבי גדרה'}
Player number:  212   {'name': 'פבל סטסנקו', 'id': '2189', 'club': 'הפועל בת ים'}
Player number:  213   {'name': 'אילן דרמון', 'id': '322', 'club': 'מ. בני הרצליה'}
Player number:  214   {'name': 'אלכסנדר שפירו', 'id': '2629', 'club': 'אליצור קריית אתא'}
Player number:  215   {'name': 'גלעד לוין', 'id': '2720', 'club': "בית''ר ראשון לציון"}
Player number:  216   {'name': 'עידן עופר', 'id': '6174', 'club': 'מכבי רחובות'}
Player number:  217   {'name': 'יניב יוסף', 'id': '1333', 'club': "מ.ק.מתנ''ס קריית גת"}
Player number:  218   {'name': 'טומי גרינפלד', 'id': '4167', 'club': "בית''ר ראשון לציון"}
Player number:  219   {'name': 'מקסים קוקוטוב', 'id': 

In [11]:
# Build the games table from the column lists collected in games_to_df
# (one row per recorded match) and display it.
games_df = pd.DataFrame.from_dict(games_to_df)
games_df

Unnamed: 0,match_id,match_type,date,p1_id,p1_name,p1_club,p1_rank,p1_sets,p1_home,p1_points_gained,p2_id,p2_name,p2_club,p2_rank,p2_sets,p2_home,p2_points_gained,winner_id
0,286499,ליגת על גברים 2021-2022,2021-12-28,745,יונתן שוסטרמן,מ. בני הרצליה,1778.3,2,0.0,-21.0,741,גל פרפל,הישגי כרמיאל,1430.4,3,1.0,31.5,741
1,282431,ליגת על גברים 2021-2022,2021-12-14,745,יונתן שוסטרמן,מ. בני הרצליה,1778.3,3,1.0,4.5,685,יניב קרמזין,כרמל מרכזי טניס שולחן חיפה,1412.8,0,0.0,-3.0,745
2,282435,ליגת על גברים 2021-2022,2021-12-14,745,יונתן שוסטרמן,מ. בני הרצליה,1778.3,3,1.0,3.0,2625,איתי שושן,כרמל מרכזי טניס שולחן חיפה,1350.0,0,0.0,-2.0,745
3,281142,ליגת על גברים 2021-2022,2021-12-07,745,יונתן שוסטרמן,מ. בני הרצליה,1778.3,3,1.0,1.5,892,אדי ישראלי,הפועל חיפה,1099.8,0,0.0,-1.0,745
4,281146,ליגת על גברים 2021-2022,2021-12-07,745,יונתן שוסטרמן,מ. בני הרצליה,1778.3,0,1.0,-18.0,2284,איתי אביבי,הפועל חיפה,1522.8,3,0.0,27.0,2284
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27311,226495,ליגת ארצית גברים 2019-2020,2020-01-29,849,מקסים קוקוטוב,אליצור קריית אתא,668.6,2,0.0,-5.0,1532,אברהם שטרן,הפועל עירוני צפת,669.1,3,1.0,5.0,1532
27312,226497,ליגת ארצית גברים 2019-2020,2020-01-29,849,מקסים קוקוטוב,אליצור קריית אתא,668.6,0,0.0,-2.0,1439,שחר גילעד,הפועל עירוני צפת,928.9,3,1.0,2.0,1439
27313,225328,ליגת ארצית גברים 2019-2020,2020-01-22,849,מקסים קוקוטוב,אליצור קריית אתא,668.6,3,1.0,14.0,1735,סטניסלב בוייקנר,מכבי זאב זכרון יעקב,749.7,0,0.0,-14.0,849
27314,223806,ליגת ארצית גברים 2019-2020,2020-01-15,849,מקסים קוקוטוב,אליצור קריית אתא,668.6,3,0.0,7.0,1564,יבגני מקלר,הפועל חיפה,574.4,1,1.0,-7.0,849


In [12]:
# Optional de-duplication, intentionally left disabled. The crawl records every
# game from the crawled player's side (as p1), so a game between two crawled
# players presumably appears twice under the same match_id - TODO confirm
# before enabling, since dropping duplicates keeps only one perspective.
#games_df = games_df.drop_duplicates('match_id')


In [13]:

# Summarise the games table: column dtypes and non-null counts.
games_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27316 entries, 0 to 27315
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   match_id          27316 non-null  object        
 1   match_type        27316 non-null  object        
 2   date              27316 non-null  datetime64[ns]
 3   p1_id             27316 non-null  object        
 4   p1_name           27316 non-null  object        
 5   p1_club           27316 non-null  object        
 6   p1_rank           27316 non-null  float64       
 7   p1_sets           27316 non-null  int64         
 8   p1_home           16508 non-null  float64       
 9   p1_points_gained  27316 non-null  float64       
 10  p2_id             27316 non-null  int64         
 11  p2_name           27316 non-null  object        
 12  p2_club           27316 non-null  object        
 13  p2_rank           27316 non-null  float64       
 14  p2_sets           2731

In [14]:
# Persist the games table as comma-separated UTF-8 with BOM. The DataFrame
# index is written too (to_csv default), so it comes back as an unnamed first
# column when the file is read again.
games_df.to_csv('games_table.csv',sep = ',',encoding = "utf-8-sig")