# Extracting from HTML


In [None]:
import pandas as pd
import os
import re # regex for extracting year from filename

# Find every saved season page ('sezon*.html') in the current working directory.
html_dosyalari = [
    f for f in os.listdir('.')
    if f.startswith('sezon') and f.endswith('.html')
]

if html_dosyalari:
    print(f"‚úÖ Bulunan HTML dosyalarƒ±: {html_dosyalari}")

    for dosya_adi in html_dosyalari:
        try:
            print(f"\n{dosya_adi} okunuyor...")
            # Parse the tables of the page (decoded as UTF-8); the first
            # table is taken to be the fixture list.
            tablolar = pd.read_html(dosya_adi, encoding='utf-8')
            df = tablolar[0]

            # Remove the header rows repeated inside the table body ('Wk',
            # 'Day', ...), then the blank separator rows (no home team).
            df = df[df['Wk'] != 'Wk'].dropna(subset=['Home'])

            print("‚úÖ Tablo ba≈üarƒ±yla okundu!")
            print(f"Toplam Ma√ß Sayƒ±sƒ±: {len(df)}")

            # Without a referee column no CSV is written for this file.
            if 'Referee' not in df.columns:
                print("‚ö†Ô∏è Tabloda 'Referee' s√ºtunu bulunamadƒ±. Bu dosya i√ßin CSV kaydedilemedi.")
                continue

            print("‚úÖ Hakem s√ºtunu mevcut!")
            # Keep only the columns that are needed downstream.
            df_clean = df[['Date', 'Home', 'Away', 'Score', 'Referee']]

            # Name the CSV after the two season year parts found in the file
            # name (sezon20_21.html -> clean_referee_2021.csv); otherwise fall
            # back to the file name without its extension.
            match = re.search(r'sezon(\d{2})_(\d{2})\.html', dosya_adi)
            if match:
                year_part = "".join(match.groups())
                csv_dosya_adi = f"clean_referee_{year_part}.csv"
            else:
                csv_dosya_adi = f"clean_referee_{os.path.splitext(dosya_adi)[0]}.csv"

            # The file is written as 'utf-8-sig', i.e. UTF-8 with a leading BOM.
            df_clean.to_csv(csv_dosya_adi, index=False, encoding='utf-8-sig')
            print(f"üíæ '{csv_dosya_adi}' olarak kaydedildi.")
            print(df_clean.head())

        except Exception as e:
            # Best effort: report the failure and move on to the next file.
            print(f"‚ùå '{dosya_adi}' i≈ülenirken bir hata olu≈ütu: {e}")
else:
    print("‚ùå Hata: √áalƒ±≈üma ortamƒ±nda 'sezon' ile ba≈ülayan HTML dosyasƒ± bulunamadƒ±.")

‚úÖ Bulunan HTML dosyalarƒ±: ['sezon20_21.html', 'sezon24_25.html', 'sezon21_22.html', 'sezon19_20.html', 'sezon17_18.html', 'sezon23_24.html', 'sezon18_19.html', 'sezon22_23.html']

sezon20_21.html okunuyor...
‚úÖ Tablo ba≈üarƒ±yla okundu!
Toplam Ma√ß Sayƒ±sƒ±: 420
‚úÖ Hakem s√ºtunu mevcut!
üíæ 'clean_referee_2021.csv' olarak kaydedildi.
         Date              Home           Away Score               Referee
0  2020-09-11          Rizespor     Fenerbah√ße   1‚Äì2        Arda Karde≈üler
1  2020-09-12  Fatih Karag√ºmr√ºk  Yeni Mal'spor   3‚Äì0    Abdulkadir Bitigen
2  2020-09-12         Sivasspor     Alanyaspor   0‚Äì2         Ali Palabƒ±yƒ±k
3  2020-09-12           G√∂ztepe    Denizlispor   5‚Äì1  Tugay Kaan Numanoƒülu
4  2020-09-12       Galatasaray   Gaziantep FK   3‚Äì1       Bahattin ≈ûim≈üek

sezon24_25.html okunuyor...
‚úÖ Tablo ba≈üarƒ±yla okundu!
Toplam Ma√ß Sayƒ±sƒ±: 342
‚úÖ Hakem s√ºtunu mevcut!
üíæ 'clean_referee_2425.csv' olarak kaydedildi.
         Date         Home  

# Combining Referee Data


In [None]:
import pandas as pd
import glob

# Collect every per-season referee CSV whose name starts with 'clean_referee_'.
referee_files = glob.glob("clean_referee_*.csv")

if referee_files:
    print(f"‚úÖ Bulunan hakem CSV dosyalarƒ±: {referee_files}")
    all_referee_data = []

    for file in referee_files:
        try:
            df = pd.read_csv(file, encoding='utf-8')
        except Exception as e:
            # A file that cannot be read is reported and skipped.
            print(f"‚ùå '{file}' okunurken bir hata olu≈ütu: {e}")
        else:
            all_referee_data.append(df)

    if not all_referee_data:
        print("‚ùå Hi√ßbir hakem verisi birle≈ütirilemedi.")
    else:
        # Stack all seasons into one table with a fresh 0..n-1 index.
        combined_referee_df = pd.concat(all_referee_data, ignore_index=True)

        # Save the combined table as a new CSV (UTF-8 with BOM).
        output_file_name = "Clean_Referee_All_Seasons.csv"
        combined_referee_df.to_csv(output_file_name, index=False, encoding='utf-8-sig')
        print(f"\n‚úÖ T√ºm hakem verileri '{output_file_name}' olarak birle≈ütirildi ve kaydedildi.")
        print(combined_referee_df.head())
else:
    print("‚ùå Hata: 'clean_referee_' ile ba≈ülayan hi√ßbir CSV dosyasƒ± bulunamadƒ±.")

‚úÖ Bulunan hakem CSV dosyalarƒ±: ['clean_referee_2324.csv', 'clean_referee_2223.csv', 'clean_referee_2425.csv', 'clean_referee_2122.csv', 'clean_referee_1718.csv', 'clean_referee_1920.csv', 'clean_referee_2021.csv', 'clean_referee_1819.csv']

‚úÖ T√ºm hakem verileri 'Clean_Referee_All_Seasons.csv' olarak birle≈ütirildi ve kaydedildi.
         Date         Home          Away Score           Referee
0  2023-08-11  Trabzonspor   Antalyaspor   1‚Äì0      Zorbay K√º√ß√ºk
1  2023-08-12    Kasƒ±mpa≈üa    Ankarag√ºc√º   3‚Äì2   Bahattin ≈ûim≈üek
2  2023-08-12    Konyaspor  ƒ∞stanbulspor   1‚Äì1      Burak Pakkan
3  2023-08-12  Kayserispor   Galatasaray   0‚Äì0  Halil Umut Meler
4  2023-08-12   Pendikspor     Hatayspor   1‚Äì5      √áaƒüda≈ü Altay


# Cleaning and Combining Match Statistics (Clean_Stats_All_Seasons)

In [None]:
import pandas as pd
import glob

# Candidate files: every CSV in the folder. The loop below keeps only the
# per-season stats files (e.g. stats_1718.csv, T1(17-18).csv).
dosyalar = glob.glob("*.csv")

# Only the columns that are needed downstream.
gerekli_sutunlar = [
    'Date', 'HomeTeam', 'AwayTeam',  # match identity
    'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG', 'HTR',  # scores
    'HY', 'AY', 'HR', 'AR',  # cards (main target)
    'HF', 'AF',  # fouls (for the tolerance calculation)
    'HST', 'AST', 'HC', 'AC',  # pressure: shots and corners (for normalisation)
]

data_list = []

print("üßπ Veri Temizliƒüi Ba≈ülƒ±yor...")

for dosya in dosyalar:
    # Read statistics files only: the name must contain 'stats' or 'T1' and
    # must not contain 'Final' or 'ref' (referee / final files are skipped).
    looks_like_stats = "stats" in dosya or "T1" in dosya
    is_excluded = "Final" in dosya or "ref" in dosya
    if is_excluded or not looks_like_stats:
        continue

    try:
        print(f"okunuyor: {dosya}")
        df = pd.read_csv(dosya)

        # Keep the wanted columns this file actually has, in the wanted order.
        mevcut_sutunlar = [col for col in gerekli_sutunlar if col in df.columns]
        df_clean = df.loc[:, mevcut_sutunlar].copy()

        # Normalise dates so the later merge can match on them. Values are
        # parsed day-first with no explicit format; anything unparseable
        # becomes NaT instead of raising.
        df_clean['Date'] = pd.to_datetime(df_clean['Date'], dayfirst=True, errors='coerce')

        data_list.append(df_clean)

    except Exception as e:
        # Best effort: report the failure and continue with the next file.
        print(f"‚ö†Ô∏è Hata ({dosya}): {e}")

if not data_list:
    print("‚ùå Hi√ßbir istatistik dosyasƒ± bulunamadƒ±.")
else:
    # Stack all seasons on top of each other.
    all_stats = pd.concat(data_list, ignore_index=True)

    # Save.
    all_stats.to_csv("Clean_Stats_All_Seasons.csv", index=False)
    print("\n‚úÖ Temizlenmi≈ü ƒ∞statistik Verisi Hazƒ±r: Clean_Stats_All_Seasons.csv")
    print(all_stats.head())

üßπ Veri Temizliƒüi Ba≈ülƒ±yor...
okunuyor: T1(21-22).csv
okunuyor: T1(18-19).csv
okunuyor: T1(22-23).csv
okunuyor: T1(20-21).csv
okunuyor: T1(23-24).csv
okunuyor: T1(17-18).csv
okunuyor: T1(19-20).csv
okunuyor: T1(24-25).csv

‚úÖ Temizlenmi≈ü ƒ∞statistik Verisi Hazƒ±r: Clean_Stats_All_Seasons.csv
        Date    HomeTeam     AwayTeam  FTHG  FTAG FTR  HTHG  HTAG HTR   HY  \
0 2021-08-13    Besiktas     Rizespor     3     0   H   1.0   0.0   H  2.0   
1 2021-08-14  Karagumruk    Gaziantep     3     2   H   2.0   1.0   H  2.0   
2 2021-08-14       Altay  Kayserispor     3     0   H   2.0   0.0   H  0.0   
3 2021-08-14   Hatayspor    Kasimpasa     1     1   D   0.0   1.0   A  3.0   
4 2021-08-15  Buyuksehyr   Alanyaspor     0     1   A   0.0   1.0   A  0.0   

    AY   HR   AR    HF    AF  HST  AST   HC   AC  
0  1.0  0.0  0.0  23.0  16.0  6.0  0.0  4.0  5.0  
1  3.0  0.0  1.0  10.0  15.0  5.0  4.0  4.0  1.0  
2  0.0  0.0  0.0  15.0  10.0  7.0  3.0  2.0  4.0  
3  3.0  0.0  0.0  12.0  13.

  df_clean['Date'] = pd.to_datetime(df_clean['Date'], dayfirst=True, errors='coerce')


In [None]:
import pandas as pd

# The all_stats DataFrame must already exist in the kernel state.
# To reload it from disk instead, use:
# all_stats = pd.read_csv('Clean_Stats_All_Seasons.csv')

# Make sure 'Date' is datetime. It normally already is after the previous
# step, but the conversion is kept as a safety net (e.g. after a reload).
all_stats['Date'] = pd.to_datetime(all_stats['Date'])

# Select the rows whose match date falls in calendar year 2017.
played_in_2017 = all_stats['Date'].dt.year == 2017
stats_2017 = all_stats[played_in_2017]

# Show the result.
print("2017 yƒ±lƒ±na ait istatistikler:")
display(stats_2017.head())

2017 yƒ±lƒ±na ait istatistikler:


Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,HY,AY,HR,AR,HF,AF,HST,AST,HC,AC
1828,2017-08-11,Buyuksehyr,Bursaspor,1,0,H,1.0,0.0,H,1.0,1.0,0.0,0.0,15.0,9.0,3.0,1.0,0.0,3.0
1829,2017-08-12,Akhisar Belediyespor,Sivasspor,1,0,H,0.0,0.0,D,3.0,2.0,0.0,0.0,17.0,15.0,3.0,1.0,3.0,4.0
1830,2017-08-12,Alanyaspor,Kasimpasa,1,3,A,1.0,2.0,A,2.0,4.0,0.0,0.0,12.0,23.0,7.0,3.0,5.0,3.0
1831,2017-08-12,Genclerbirligi,Karabukspor,1,1,D,1.0,1.0,D,2.0,3.0,0.0,0.0,20.0,19.0,3.0,1.0,4.0,1.0
1832,2017-08-12,Goztep,Fenerbahce,2,2,D,1.0,1.0,D,4.0,1.0,0.0,0.0,20.0,12.0,4.0,5.0,0.0,4.0


# Merging the Datasets and Feature Engineering


In [None]:
import pandas as pd
import os

# Clubs flagged as "Big 4" by the Home_is_Big4 / Away_is_Big4 columns below.
big_4 = ['Galatasaray', 'Fenerbah√ße', 'Be≈üikta≈ü', 'Trabzonspor']

print("üöÄ B√úY√úK Bƒ∞RLE≈ûTƒ∞RME BA≈ûLIYOR...\n")

try:
    # 1. Read the two combined files.
    df_stats = pd.read_csv("Clean_Stats_All_Seasons.csv")
    df_ref = pd.read_csv("Clean_Referee_All_Seasons.csv")

    # 2. Bring both date columns to datetime so they can be used as merge keys.
    df_stats['Date'] = pd.to_datetime(df_stats['Date'])
    df_ref['Date'] = pd.to_datetime(df_ref['Date'])

    # 3. Normalise team names so the same club is spelled identically in
    #    both sources (the merge matches on the exact home-team string).
    clean_map = {
        'Galatasaray SK': 'Galatasaray', 'Fenerbahce': 'Fenerbah√ße',
        'Besiktas': 'Be≈üikta≈ü', 'Besiktas JK': 'Be≈üikta≈ü',
        'Trabzonspor': 'Trabzonspor',
        'Buyuksehyr': 'Ba≈üak≈üehir', 'Medipol Basaksehir': 'Ba≈üak≈üehir', 'Basaksehir': 'Ba≈üak≈üehir',
        'Goztep': 'G√∂ztepe', 'Ad. Demirspor': 'Adana Demirspor', # New additions from combined stats
        'Kasimpasa': 'Kasƒ±mpa≈üa' # New addition from combined stats
    }
    df_stats['HomeTeam'] = df_stats['HomeTeam'].replace(clean_map)
    df_stats['AwayTeam'] = df_stats['AwayTeam'].replace(clean_map)
    df_ref['Home'] = df_ref['Home'].replace(clean_map)
    df_ref['Away'] = df_ref['Away'].replace(clean_map)

    # 4. MERGE: a stats row is kept only if a referee row exists for the
    #    same date and home team.
    master_df = pd.merge(
        df_stats,
        df_ref,
        left_on=['Date', 'HomeTeam'],
        right_on=['Date', 'Home'],
        how='inner'
    )

    # 'Home' duplicates 'HomeTeam' after the merge; the referee-side 'Away'
    # column is kept, as before.
    if 'Home' in master_df.columns:
        master_df = master_df.drop(columns=['Home'])

    print(f"‚úÖ Veriler ba≈üarƒ±yla birle≈ütirildi: {len(master_df)} ma√ß")

    # The inner join drops unmatched stats rows without any message, which
    # hides team-name spellings missing from clean_map. Report them so the
    # map can be extended.
    ref_keys = df_ref[['Date', 'Home']].drop_duplicates()
    match_probe = df_stats[['Date', 'HomeTeam']].merge(
        ref_keys,
        left_on=['Date', 'HomeTeam'],
        right_on=['Date', 'Home'],
        how='left',
        indicator=True,
    )
    unmatched_home = match_probe.loc[match_probe['_merge'] == 'left_only', 'HomeTeam']
    if not unmatched_home.empty:
        print(f"WARNING: {len(unmatched_home)} stats rows have no referee match and were dropped.")
        print(f"Home teams with unmatched rows: {sorted(unmatched_home.dropna().unique())}")

    # 5. Derive 'Season' as two 2-digit years, treating August as the first
    #    month of a season.
    #    e.g. 2023-08-11 -> 2324, 2024-05-15 -> 2324, 2024-08-09 -> 2425
    match_month = master_df['Date'].dt.month
    year_start = master_df['Date'].dt.year
    year_end = year_start + 1

    master_df.loc[match_month < 8, 'Season'] = (year_start - 1).astype(str).str[2:] + year_start.astype(str).str[2:]
    master_df.loc[match_month >= 8, 'Season'] = year_start.astype(str).str[2:] + year_end.astype(str).str[2:]
    master_df['Season'] = master_df['Season'].astype(str)  # ensure string type

    # 6. 'hasfans': 1 = played with spectators (default), 0 = without.
    master_df['hasfans'] = 1

    # 2019-20 season: matches after 2020-03-17 are flagged as without fans.
    master_df.loc[
        (master_df['Season'] == '1920') & (master_df['Date'] > pd.Timestamp('2020-03-17')),
        'hasfans'
    ] = 0

    # 2020-21 season: every match is flagged as without fans.
    master_df.loc[master_df['Season'] == '2021', 'hasfans'] = 0

except Exception as e:
    print(f"‚ùå Veri birle≈ütirilirken bir hata olu≈ütu: {e}")
    # Re-raise instead of exit(): the traceback stays visible and the
    # interpreter / notebook kernel is not shut down.
    raise

# --- FEATURE ENGINEERING (COLUMNS FOR THE ANALYSIS) ---

# 1. Big 4 flags (1 if the team is in big_4, else 0).
master_df['Home_is_Big4'] = master_df['HomeTeam'].isin(big_4).astype(int)
master_df['Away_is_Big4'] = master_df['AwayTeam'].isin(big_4).astype(int)

# 2. Tolerance index (fouls per card).
# Card totals (yellow + red).
master_df['Home_Total_Cards'] = master_df['HY'] + master_df['HR']
master_df['Away_Total_Cards'] = master_df['AY'] + master_df['AR']

# When a side has no cards (or the total is missing) the tolerance is simply
# its foul count, which also avoids a division by zero.
home_has_cards = master_df['Home_Total_Cards'] > 0
master_df['Home_Foul_Tolerance'] = (
    master_df['HF'] / master_df['Home_Total_Cards']
).where(home_has_cards, master_df['HF'])

away_has_cards = master_df['Away_Total_Cards'] > 0
master_df['Away_Foul_Tolerance'] = (
    master_df['AF'] / master_df['Away_Total_Cards']
).where(away_has_cards, master_df['AF'])

# 3. Pressure index (shots + corners).
# Use shots on target (HST/AST) when present, otherwise total shots (HS/AS).
if 'HST' in master_df.columns:
    master_df['Home_Pressure'] = master_df['HST'] + master_df['HC']
    master_df['Away_Pressure'] = master_df['AST'] + master_df['AC']
else:
    # NOTE(review): this fallback needs 'HS'/'AS' in the merged data and
    # raises KeyError otherwise - confirm the stats file carries them.
    master_df['Home_Pressure'] = master_df['HS'] + master_df['HC']
    master_df['Away_Pressure'] = master_df['AS'] + master_df['AC']

# --- CHRONOLOGICAL ORDER ---
master_df = master_df.sort_values(by='Date').reset_index(drop=True)

# --- SAVE ---
output_name = "TURKISH_SUPER_LIG_FULL_DATASET.csv"
master_df.to_csv(output_name, index=False, encoding='utf-8-sig')
print(f"\nüéâ ƒ∞≈ûLEM TAMAM! Master dosya olu≈üturuldu: {output_name}")
print("S√ºtunlar:", master_df.columns.tolist())

üöÄ B√úY√úK Bƒ∞RLE≈ûTƒ∞RME BA≈ûLIYOR...

‚úÖ Veriler ba≈üarƒ±yla birle≈ütirildi: 2183 ma√ß

üéâ ƒ∞≈ûLEM TAMAM! Master dosya olu≈üturuldu: TURKISH_SUPER_LIG_FULL_DATASET.csv
S√ºtunlar: ['Date', 'HomeTeam', 'AwayTeam', 'FTHG', 'FTAG', 'FTR', 'HTHG', 'HTAG', 'HTR', 'HY', 'AY', 'HR', 'AR', 'HF', 'AF', 'HST', 'AST', 'HC', 'AC', 'Away', 'Score', 'Referee', 'Season', 'hasfans', 'Home_is_Big4', 'Away_is_Big4', 'Home_Total_Cards', 'Away_Total_Cards', 'Home_Foul_Tolerance', 'Away_Foul_Tolerance', 'Home_Pressure', 'Away_Pressure']


In [None]:
import pandas as pd

# Read back the master CSV that was just written.
full_data = pd.read_csv('TURKISH_SUPER_LIG_FULL_DATASET.csv')

# Show the first 5 rows as a quick sanity check.
display(full_data.head())

Unnamed: 0,Date,HomeTeam,AwayTeam,FTHG,FTAG,FTR,HTHG,HTAG,HTR,HY,...,Season,hasfans,Home_is_Big4,Away_is_Big4,Home_Total_Cards,Away_Total_Cards,Home_Foul_Tolerance,Away_Foul_Tolerance,Home_Pressure,Away_Pressure
0,2017-08-11,Ba≈üak≈üehir,Bursaspor,1,0,H,1.0,0.0,H,1.0,...,1718,1,0,0,1.0,1.0,15.0,9.0,3.0,4.0
1,2017-08-12,Alanyaspor,Kasƒ±mpa≈üa,1,3,A,1.0,2.0,A,2.0,...,1718,1,0,0,2.0,4.0,6.0,5.75,12.0,6.0
2,2017-08-12,G√∂ztepe,Fenerbah√ße,2,2,D,1.0,1.0,D,4.0,...,1718,1,0,1,4.0,1.0,5.0,12.0,4.0,9.0
3,2017-08-13,Be≈üikta≈ü,Antalyaspor,2,0,H,1.0,0.0,H,3.0,...,1718,1,1,0,3.0,4.0,7.666667,4.0,8.0,5.0
4,2017-08-13,Trabzonspor,Konyaspor,2,1,H,1.0,1.0,D,4.0,...,1718,1,1,0,4.0,2.0,5.0,10.0,7.0,10.0


# Cleaning and Handling the Missing Matches


In [None]:
import pandas as pd

# 1. Read the current master file.
#    'Season' is forced to text: left to type inference, codes such as 2223
#    are read as integers and the string comparison in step 3 never matches.
df = pd.read_csv("TURKISH_SUPER_LIG_FULL_DATASET.csv", dtype={'Season': str})

print(f"Temizlik √ñncesi Ma√ß Sayƒ±sƒ±: {len(df)}")

# 2. Drop rows with missing data.
# A match without referee, yellow-card or foul data cannot be analysed.
# 'subset' restricts the NaN check to these critical columns only.
df_clean = df.dropna(subset=['Referee', 'HY', 'AY', 'HF', 'AF'])

print(f"Temizlik Sonrasƒ± Ma√ß Sayƒ±sƒ±: {len(df_clean)}")
print(f"Silinen Ma√ß Sayƒ±sƒ±: {len(df) - len(df_clean)}")

# 3. Check: are any 2022-23 Hatayspor matches dated after 2023-02-06 left?
#    Dates are parsed into a local Series for the comparison only; the
#    'Date' column that gets written back is not modified.
match_dates = pd.to_datetime(df_clean['Date'])
involves_hatay = (df_clean['HomeTeam'] == 'Hatayspor') | (df_clean['AwayTeam'] == 'Hatayspor')
hatay_check = df_clean[
    (df_clean['Season'] == '2223') &
    involves_hatay &
    (match_dates > pd.Timestamp('2023-02-06'))
]

if hatay_check.empty:
    print("‚úÖ Deprem sonrasƒ± oynanmayan ma√ßlar ba≈üarƒ±yla temizlendi.")
else:
    print(f"‚ö†Ô∏è Dikkat: Hala {len(hatay_check)} adet ≈ü√ºpheli Hatayspor ma√ßƒ± var.")

# 4. Save the cleaned data, overwriting the master file.
df_clean.to_csv("TURKISH_SUPER_LIG_FULL_DATASET.csv", index=False, encoding='utf-8-sig')
print("üíæ Dosya g√ºncellendi ve kaydedildi.")

Temizlik √ñncesi Ma√ß Sayƒ±sƒ±: 2183
Temizlik Sonrasƒ± Ma√ß Sayƒ±sƒ±: 2162
Silinen Ma√ß Sayƒ±sƒ±: 21
‚úÖ Deprem sonrasƒ± oynanmayan ma√ßlar ba≈üarƒ±yla temizlendi.
üíæ Dosya g√ºncellendi ve kaydedildi.


# Current Rankings Included

In [None]:
import pandas as pd
import numpy as np
from functools import cmp_to_key

# 1. Load the master dataset, parse the dates and order the matches season
#    by season, chronologically within each season.
df = pd.read_csv('TURKISH_SUPER_LIG_FULL_DATASET.csv')
df = df.assign(Date=pd.to_datetime(df['Date'])).sort_values(by=['Season', 'Date'])

# Create both rank columns filled with NaN (which makes them float for now).
for rank_column in ('Home_Rank', 'Away_Rank'):
    df[rank_column] = np.nan

def get_points(ftr, team_type):
    """Return the points one side earns from a full-time result.

    Args:
        ftr: Result code: 'H' (home win), 'A' (away win) or 'D' (draw).
        team_type: Side the points are computed for: 'Home' or 'Away'.

    Returns:
        1 for a draw, 3 when the given side is the winner, 0 otherwise
        (including any unrecognised ``ftr`` / ``team_type`` value).
    """
    if ftr == 'D':
        return 1
    # The only (side, result) combinations that count as a win.
    side_won = (team_type, ftr) in (('Home', 'H'), ('Away', 'A'))
    return 3 if side_won else 0

# Custom Comparator for Sorting
def compare_teams(t1, t2, standings, h2h_records):
    """Three-way comparison of two teams for ordering a league table.

    Tie-breakers are applied in this order, stopping at the first one that
    separates the teams:
      1. points,
      2. goal difference,
      3. points earned in the matches the two teams played against each other,
      4. goals scored.

    Args:
        t1: Name of the first team.
        t2: Name of the second team.
        standings: Maps a team name to a dict with 'Points', 'GD' and 'GF'.
        h2h_records: Maps a sorted ``(team_a, team_b)`` tuple to a list of
            match dicts with the keys 'Home', 'Away' and 'FTR'.

    Returns:
        A positive number when ``t1`` ranks above ``t2``, a negative number
        when it ranks below, and 0 when all four criteria are level.
    """
    row1, row2 = standings[t1], standings[t2]

    # Criteria 1 and 2: points, then goal difference.
    for column in ('Points', 'GD'):
        if row1[column] != row2[column]:
            return row1[column] - row2[column]

    # Criterion 3: head-to-head points from the matches between the two.
    t1_h2h_pts = 0
    t2_h2h_pts = 0
    pair = tuple(sorted((t1, t2)))
    for m in h2h_records.get(pair, ()):
        # Each stored match is {'Home': ..., 'Away': ..., 'FTR': 'H'/'A'/'D'}.
        t1_was_home = m['Home'] == t1
        t1_h2h_pts += get_points(m['FTR'], 'Home' if t1_was_home else 'Away')
        t2_h2h_pts += get_points(m['FTR'], 'Away' if t1_was_home else 'Home')

    h2h_margin = t1_h2h_pts - t2_h2h_pts
    if h2h_margin != 0:
        return h2h_margin

    # Criterion 4: goals scored, the final tie-breaker.
    return row1['GF'] - row2['GF']


# Rebuild the league table match by match, one season at a time.
for season in df['Season'].unique():
    season_df = df[df['Season'] == season]

    # Every team that appears in this season, as home or away side.
    teams = set(season_df['HomeTeam']) | set(season_df['AwayTeam'])
    num_teams = len(teams)

    # Running table: all teams start from zero.
    standings = {team: {'Points': 0, 'GD': 0, 'GF': 0, 'Played': 0} for team in teams}

    # Head-to-head store: key = sorted (TeamA, TeamB) tuple, value = list of
    # match dicts.
    h2h_records = {}

    # The comparator reads the two dicts above at call time, so it always
    # sees the table as it stands before the current match.
    sorter = cmp_to_key(lambda t1, t2: compare_teams(t1, t2, standings, h2h_records))

    for idx, row in season_df.iterrows():
        home, away = row['HomeTeam'], row['AwayTeam']

        # Ranks are only recorded once the home team has already played at
        # least (number of teams - 1) matches, i.e. in the second half.
        is_second_half = standings[home]['Played'] >= num_teams - 1

        if is_second_half:
            # Order the table best-first and look up both sides' positions.
            current_teams = sorted(teams, key=sorter, reverse=True)
            h_rank, a_rank = (current_teams.index(team) + 1 for team in (home, away))

            # Store the pre-match ranks on the main DataFrame.
            df.at[idx, 'Home_Rank'] = h_rank
            df.at[idx, 'Away_Rank'] = a_rank

        # --- Update the table and H2H store AFTER the ranks were taken ---
        hg, ag = row['FTHG'], row['FTAG']
        ftr = row['FTR']

        for team, scored, conceded in ((home, hg, ag), (away, ag, hg)):
            record = standings[team]
            record['Played'] += 1
            record['GF'] += scored
            record['GD'] += scored - conceded

        # 3 points for the winner; any other result code counts as a draw.
        home_pts, away_pts = {'H': (3, 0), 'A': (0, 3)}.get(ftr, (1, 1))
        standings[home]['Points'] += home_pts
        standings[away]['Points'] += away_pts

        # Remember the match for later head-to-head comparisons.
        pair = tuple(sorted((home, away)))
        h2h_records.setdefault(pair, []).append({
            'Home': home,
            'Away': away,
            'FTR': ftr
        })

# Switch the rank columns to the nullable integer dtype: matches that got no
# rank keep a missing value instead of forcing the column to float.
df = df.astype({'Home_Rank': 'Int64', 'Away_Rank': 'Int64'})

# Save to CSV (UTF-8 with BOM).
output_filename = 'TURKISH_SUPER_LIG_FULL_DATASET_WITH_RANKS.csv'
df.to_csv(output_filename, index=False, encoding='utf-8-sig')

print(f"File saved to {output_filename}")
print(df[['Date', 'HomeTeam', 'Home_Rank', 'AwayTeam', 'Away_Rank']].head())