In [1]:
import pandas as pd
import sqlite3
import re
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Load the raw match results from the CSV file
results = pd.read_csv(filepath_or_buffer="results.csv")

In [3]:
def connect_to_database(database_name):
    """Open the SQLite database `database_name`.

    Returns a ``(connection, cursor)`` pair; the cursor belongs to the
    returned connection.
    """
    connection = sqlite3.connect(database_name)
    return connection, connection.cursor()

conn, cursor = connect_to_database('football.db')

try:
    # (Re)create the 'results' table from the DataFrame loaded from the CSV.
    results.to_sql('results', conn, if_exists='replace', index=False)

    # Add the 'day', 'month' and 'year' columns to 'results' unless they already exist.
    cursor.execute("PRAGMA table_info(results);")
    columns = cursor.fetchall()
    # Index 1 of each PRAGMA table_info row is read as the column name.
    column_names = [column[1] for column in columns]
    for date_part in ('day', 'month', 'year'):
        if date_part not in column_names:
            # Identifiers cannot be bound as SQL parameters; the names come
            # only from the fixed tuple above, never from external input.
            cursor.execute(f"ALTER TABLE results ADD COLUMN {date_part} INTEGER;")

    conn.commit()
finally:
    # Release the database handle even if one of the statements above fails.
    conn.close()

In [None]:
def extract_date_info(date_str):
    """Split a date string into a ``(day, month, year)`` tuple of integers.

    Three layouts are tried, in this order:

    1. ``M/D/YYYY`` -- accepted only when the year is 1900 or later;
    2. ``M-D-YYYY``;
    3. ``YYYY-MM-DD``.

    The dash layouts are matched at the start of the string, so trailing
    text after the date is ignored.

    Returns ``(None, None, None)`` when ``date_str`` is not a string
    (for example ``None`` or NaN) or matches none of the layouts.
    """
    # Missing values arrive as None/NaN; treat them as "unparsed" instead of
    # letting .split() / re.match() raise AttributeError or TypeError.
    if not isinstance(date_str, str):
        return None, None, None
    date_str = date_str.strip()

    try:
        month, day, year = map(int, date_str.split('/'))
        if year >= 1900:
            return day, month, year
    except ValueError:
        # Not exactly three integer fields separated by '/': try the dash layouts.
        pass

    match = re.match(r'(\d{1,2})-(\d{1,2})-(\d{4})', date_str)
    if match:
        month, day, year = map(int, match.groups())
        return day, month, year

    match = re.match(r'(\d{4})-(\d{2})-(\d{2})', date_str)
    if match:
        year, month, day = map(int, match.groups())
        return day, month, year

    return None, None, None

conn = sqlite3.connect('football.db')
cursor = conn.cursor()

try:
    # Fill 'day', 'month' and 'year' from the parsed 'date' text.
    # Each UPDATE below touches every row sharing that date, so each distinct
    # date only needs to be processed once. NULL dates are skipped: they cannot
    # be parsed and "WHERE date = NULL" would never match a row anyway.
    cursor.execute("SELECT DISTINCT date FROM results WHERE date IS NOT NULL;")
    dates = cursor.fetchall()

    # One parameter tuple (day, month, year, date) per distinct date.
    updates = [(*extract_date_info(date_str), date_str) for (date_str,) in dates]
    cursor.executemany("UPDATE results SET day=?, month=?, year=? WHERE date=?", updates)

    conn.commit()
finally:
    # Release the database handle even if parsing or an UPDATE fails.
    conn.close()

In [None]:
conn = sqlite3.connect('football.db')

# Pull the complete 'results' table into a DataFrame
df_results = pd.read_sql(sql='SELECT * FROM results', con=conn)

def assign_unique_codes(df, column, start_code):
    """Map each distinct non-null value of ``df[column]`` to a consecutive integer.

    Codes are handed out in order of first appearance, beginning at
    ``start_code``. Returns ``(mapping, next_code)`` where ``next_code`` is the
    first code that was not used, so a later call can continue the numbering.
    """
    present_values = [value for value in df[column].unique() if pd.notnull(value)]
    next_code = start_code + len(present_values)
    mapping = dict(zip(present_values, range(start_code, next_code)))
    return mapping, next_code

try:
    # Give every home team a unique code, starting at 100.
    unique_codes_home_team, code = assign_unique_codes(df_results, 'home_team', 100)

    # Tournament codes continue from the first unused team code.
    unique_codes_tournament, code = assign_unique_codes(df_results, 'tournament', code)

    # City codes continue from the first unused tournament code.
    unique_codes_city, code = assign_unique_codes(df_results, 'city', code)

    # Away teams and host countries share the home-team code table. Names that
    # never occur as a home team used to get no code and were then filled with
    # the code of that row's HOME team, which made two different names
    # indistinguishable. Give such names their own codes instead. They are
    # numbered after the city codes so every code assigned above keeps its value.
    other_names = pd.concat([df_results['away_team'], df_results['country']]).dropna().unique()
    for name in other_names:
        if name not in unique_codes_home_team:
            unique_codes_home_team[name] = code
            code += 1

    # Apply the shared team/country codes.
    df_results['home_team_code'] = df_results['home_team'].map(unique_codes_home_team)
    # astype(int) raises if a name is missing (NaN) rather than silently
    # substituting another team's code.
    df_results['away_team_code'] = df_results['away_team'].map(unique_codes_home_team).astype(int)
    df_results['country_code'] = df_results['country'].map(unique_codes_home_team).astype(int)

    df_results['tournament_code'] = df_results['tournament'].map(unique_codes_tournament)
    df_results['city_code'] = df_results['city'].map(unique_codes_city)

    # Persist the encoded table, replacing the previous 'results' table.
    df_results.to_sql('results', conn, if_exists='replace', index=False)
finally:
    # Release the database handle even if encoding or the write fails.
    conn.close()

In [None]:
# Columns (codes and scores) that take part in the correlation analysis
selected_columns = ['home_team_code','away_team_code','home_score','away_score','country_code','tournament_code','city_code']

# Restrict the frame to those columns and compute their pairwise correlations
selected_df = df_results[selected_columns]
correlation_matrix = selected_df.corr()

# Annotated heatmap of the correlation matrix
plt.figure(figsize=(8, 6))
sns.heatmap(correlation_matrix, fmt=".2f", cmap='coolwarm', annot=True)
plt.title('Matrice de corelație results')
plt.show()

In [None]:
# Team scores
home_scores = df_results['home_score']
away_scores = df_results['away_score']

# Gini impurity of each score distribution: 1 - sum(p_i ** 2), where p_i is the
# relative frequency of score value i. (The previous 1 - 2 * sum(p_i ** 2) is
# not the Gini impurity and becomes negative for concentrated distributions.)
gini_home = 1 - (home_scores.value_counts(normalize=True) ** 2).sum()
gini_away = 1 - (away_scores.value_counts(normalize=True) ** 2).sum()

print("Indicele Gini pentru scorul echipei gazde:", gini_home)
print("Indicele Gini pentru scorul echipei oaspete:", gini_away)

In [None]:
# Bar labels and the Gini values they represent
indices = ['Gazdă', 'Oaspeți']
values = [gini_home, gini_away]

# One bar per team role
plt.figure(figsize=(8, 6))
plt.bar(indices, values, color=['blue', 'green'])

# Fix the y axis to [0, 1] for readability and add horizontal guide lines
plt.ylim(0, 1)
plt.grid(axis='y', linestyle='--', alpha=0.7)

plt.xlabel('Echipă')
plt.ylabel('Gini Index')
plt.title('Gini Index pentru scorurile echipelor')
plt.show()

In [None]:
# Gini impurity of a categorical column: 1 - sum(p_i ** 2), where p_i is the
# relative frequency of category i. (The previous 1 - 2 * sum(p_i ** 2) is not
# the Gini impurity and becomes negative for concentrated distributions.)

# Host country
gini_country = 1 - (df_results['country'].value_counts(normalize=True) ** 2).sum()

# Tournament
gini_tournament = 1 - (df_results['tournament'].value_counts(normalize=True) ** 2).sum()

# City
gini_city = 1 - (df_results['city'].value_counts(normalize=True) ** 2).sum()

# Neutral-venue flag
gini_neutral = 1 - (df_results['neutral'].value_counts(normalize=True) ** 2).sum()

print("Indicele Gini pentru țara gazdă:", gini_country)
print("Indicele Gini pentru turneu:", gini_tournament)
print("Indicele Gini pentru oraș:", gini_city)
# Previously computed but never reported.
print("Indicele Gini pentru teren neutru:", gini_neutral)

In [None]:
# Labels and the Gini values they belong to
indices = ['Country', 'Tournament', 'City']
values = [gini_country, gini_tournament, gini_city]

# The points are plotted in ascending order of value, so the labels must be
# reordered the same way; otherwise each annotation is attached to whichever
# point happens to share its list position, not to its own value.
order = np.argsort(values, kind='stable')
sorted_values = np.asarray(values, dtype=float)[order]
sorted_labels = [indices[i] for i in order]

# x: cumulative share of the sorted values; y: evenly spaced fractions in [0, 1]
x_points = np.cumsum(sorted_values) / np.sum(sorted_values)
y_points = np.linspace(0, 1, len(values))

plt.figure(figsize=(8, 6))
# Dashed diagonal y = x used as the reference line
plt.plot(y_points, y_points, label='Lorenz Curve', color='red', linestyle='--')

# Points for the Gini values
plt.scatter(x_points, y_points, label='Gini Index', color='blue')

# Annotate every point with the label of the value it represents
for label, x, y in zip(sorted_labels, x_points, y_points):
    plt.annotate(label, (x, y), xytext=(-20, 10), textcoords='offset points')

plt.title('Curba lui Lorenz pentru Indicele Gini')
plt.xlabel('Fracția cumulată a eșantioanelor')
plt.ylabel('Fracția cumulată a variabilei (Gini Index)')
plt.legend()
plt.grid(True)
plt.show()

In [None]:

# Relative frequency of every distinct home-score value
home_scores = df_results['home_score']
frequency_distribution = home_scores.value_counts(normalize=True)

# Shannon entropy in bits: H = -sum(p * log2(p))
entropy_home_scores = -(frequency_distribution.mul(np.log2(frequency_distribution)).sum())

print("Entropia pentru scorurile echipei gazdă:", entropy_home_scores)


In [None]:
# Relative frequency of every distinct away-score value
away_scores = df_results['away_score']
frequency_distribution = away_scores.value_counts(normalize=True)

# Shannon entropy in bits: H = -sum(p * log2(p))
entropy_away_scores = -(frequency_distribution.mul(np.log2(frequency_distribution)).sum())

print("Entropia pentru scorurile echipei oaspete:", entropy_away_scores)

In [None]:
# Relative frequency of every tournament
tournaments = df_results['tournament']
frequency_distribution = tournaments.value_counts(normalize=True)

# Shannon entropy in bits: H = -sum(p * log2(p))
entropy_tournaments = -(frequency_distribution.mul(np.log2(frequency_distribution)).sum())

print("Entropia pentru turnee:", entropy_tournaments)

In [None]:
# Relative frequency of each value of the neutral-venue flag
neutral = df_results['neutral']
frequency_distribution = neutral.value_counts(normalize=True)

# Shannon entropy in bits: H = -sum(p * log2(p))
entropy_neutral = -(frequency_distribution.mul(np.log2(frequency_distribution)).sum())

print("Entropia pentru teren neutru:", entropy_neutral)

In [None]:
# Labels listed in the same order as the values below, so each bar carries the
# name of the variable whose entropy it shows (the labels were previously in a
# different order from the values).
variabile = ['Home Score', 'Away score', 'Tournament', 'Neutral']
entropii = [entropy_home_scores,entropy_away_scores,entropy_tournaments,entropy_neutral]

plt.figure(figsize=(10, 6))
plt.bar(variabile, entropii, color='skyblue')

plt.title('Entropia pentru diferite variabile')
plt.xlabel('Variabilă')
plt.ylabel('Entropie')
plt.xticks(rotation=45)  # Rotate the x tick labels so they are easier to read
plt.grid(axis='y', linestyle='--', alpha=0.7)

# Show the chart
plt.show()

In [None]:
# KNN MODEL
# Add a column holding the total number of goals in the match; this is the target.
df_results['total_score'] = df_results['home_score'] + df_results['away_score']

# Features (X) and target (y).
# 'home_score' and 'away_score' are deliberately NOT features: the target is
# their sum, so including them leaks the answer into the model and makes every
# reported metric meaningless.
X = df_results[['day', 'month', 'year', 'home_team_code', 'away_team_code', 'tournament_code', 'country_code']]
y = df_results['total_score']

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Hyper-parameter grid to explore
param_grid = {
    'n_neighbors': [3, 5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'p': [1, 2]
}

# KNN classifier: each distinct total score is treated as a class label
knn_model = KNeighborsClassifier()

# 5-fold cross-validated grid search, selecting by accuracy
grid_search = GridSearchCV(knn_model, param_grid, cv=5, scoring='accuracy')

# Run the search on the training data
grid_search.fit(X_train, y_train)

# Report the best hyper-parameter combination
print("Cea mai bună combinație de hiperparametrii:", grid_search.best_params_)

# Model refitted with the best hyper-parameters
best_knn_model = grid_search.best_estimator_

# Predict on the held-out test data
y_pred = best_knn_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Acuratețea modelului KNN cu cei mai buni hiperparametrii:", accuracy)

# The class labels are goal counts, so squared/absolute errors are also reported
mse = mean_squared_error(y_test, y_pred)

mae = mean_absolute_error(y_test, y_pred)

print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)


In [None]:
# Random Forest MODEL
# Target: total number of goals in the match.
df_results['total_score'] = df_results['home_score'] + df_results['away_score']

# Single source of truth for the feature set and its column order; the same
# list is used to build the prediction input further down.
feature_columns = ['day', 'month', 'year', 'home_team_code', 'away_team_code','tournament_code', 'country_code']
X = df_results[feature_columns]
y = df_results['total_score']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest regressor
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)

mse_rf = mean_squared_error(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)

print("Mean Squared Error (Random Forest):", mse_rf)
print("Mean Absolute Error (Random Forest):", mae_rf)

# Match to predict, expressed with the unique codes assigned earlier
input_data = {
    'home_team_code': 245,
    'away_team_code': 211,
    'day': 10,
    'month': 3,
    'year': 2024,
    'tournament_code': 458,
    'country_code': 245
}

# Check that every code was provided
if None in [input_data['home_team_code'], input_data['away_team_code'], input_data['tournament_code'], input_data['country_code']]:
    print("Una dintre informațiile introduse nu are un cod unic asociat!")
else:
    # Build the input row with exactly the columns, and column order, the model
    # was trained on. The previous version also read 'home_score'/'away_score'
    # from input_data (absent keys -> KeyError) and produced 9 values for a
    # model trained on 7 features.
    X_input = pd.DataFrame([input_data], columns=feature_columns)

    predicted_score = rf_model.predict(X_input)

    # The model predicts the TOTAL number of goals, so that is what is
    # reported; a total cannot be split into a home and an away score.
    print("Scorul total prezis pentru meci este:", round(float(predicted_score[0]), 2))




In [None]:
# Neural Network Regression MODEL
# Match to predict, expressed with the unique codes assigned earlier
input_data = {
    'home_team_code': 245,
    'away_team_code': 211,
    'day': 10,
    'month': 3,
    'year': 2024,
    'tournament_code': 458,
    'country_code': 245
}

# Single input row, in the same column order as X below
X_input = np.array([
    [
     input_data['home_team_code'], input_data['away_team_code'],
        input_data['day'], input_data['month'], input_data['year'],
     input_data['tournament_code'], input_data['country_code']]
])

X = df_results[['home_team_code', 'away_team_code','day','month','year', 'tournament_code', 'country_code']]
y1 = df_results['home_score']
y2= df_results['away_score']

# One split shared by both targets, so the home and away rows stay aligned.
X_train, X_test, y_train1, y_test1, y_train2, y_test2 = train_test_split(
    X, y1, y2, test_size=0.2, random_state=42
)

# Neural network regressor
nn_model = MLPRegressor(hidden_layer_sizes=(100, 50), activation='relu', solver='adam', max_iter=500, random_state=42)

# --- Home score ---
nn_model.fit(X_train, y_train1)

y_pred1 = nn_model.predict(X_test)

# Compute the error of THIS model; the previous version printed a stale `mse`
# variable left over from an earlier cell.
mse1 = mean_squared_error(y_test1, y_pred1)
print("Mean Squared Error (MSE) pentru Neural Network Regression:", mse1)

print(nn_model.predict(X_input))

# --- Away score ---
# The same estimator object is refitted on the away-score target; y_pred1 was
# already captured above.
nn_model.fit(X_train, y_train2)

y_pred2 = nn_model.predict(X_test)

mse2 = mean_squared_error(y_test2, y_pred2)
print("Mean Squared Error (MSE) pentru Neural Network Regression:", mse2)

print(nn_model.predict(X_input))

print("Mean Squared Error (MSE) for Home Score prediction:", mse1)
print("Mean Squared Error (MSE) for Away Score prediction:", mse2)

# Exact-score accuracy after rounding the regression output to whole goals
rounded_pred1 = np.round(y_pred1)
rounded_pred2 = np.round(y_pred2)

accuracy1 = accuracy_score(y_test1, rounded_pred1)
accuracy2 = accuracy_score(y_test2, rounded_pred2)

print("Accuracy for Home Score prediction:", accuracy1)
print("Accuracy for Away Score prediction:", accuracy2)

In [None]:
# XGBoost algorithm

# Match to predict, expressed with the unique codes assigned earlier
input_data = {
    'home_team_code': 108,
    'away_team_code': 236,
    'day': 3,
    'month': 12,
    'year': 2022,
    'tournament_code': 444,
    'country_code': 278,
    'neutral': 1
}

# Single input row, in the same column order as X below
X_input = np.array([
    [
        input_data['home_team_code'], input_data['away_team_code'],
        input_data['day'], input_data['month'], input_data['year'],
        input_data['tournament_code'], input_data['country_code'],
        input_data['neutral']
    ]
])

X = df_results[['home_team_code', 'away_team_code','day','month','year', 'tournament_code', 'country_code','neutral']]
y1 = df_results['home_score']
y2= df_results['away_score']

# One split shared by both targets, so the home and away rows stay aligned.
X_train, X_test, y_train1, y_test1, y_train2, y_test2 = train_test_split(
    X, y1, y2, test_size=0.2, random_state=42
)

# XGBoost regressor
model = xgb.XGBRegressor(objective='reg:squarederror', max_depth=5, n_estimators=100)

# --- home_score ---
model.fit(X_train, y_train1)
y_pred1 = model.predict(X_test)
# Prediction for the requested match
home_score_pred = model.predict(X_input)[0]

mse1 = mean_squared_error(y_test1, y_pred1)
# Label corrected: these are XGBoost results, not neural-network results.
print("Mean Squared Error (MSE) pentru XGBoost (home_score):", mse1)

# --- away_score ---
# The same estimator object is refitted on the away-score target; the
# home-score predictions were already captured above.
model.fit(X_train, y_train2)
y_pred2 = model.predict(X_test)
# Prediction for the requested match
away_score_pred = model.predict(X_input)[0]

mse2 = mean_squared_error(y_test2, y_pred2)
print("Mean Squared Error (MSE) pentru XGBoost (away_score):", mse2)

# Print the raw predictions before rounding
print(f"Scoruri înainte de rotunjire: Home: {home_score_pred}, Away: {away_score_pred}")


# Rotunjire conform regulilor specificate
def custom_round(value):
    """Round ``value`` to the nearest integer, with exact halves rounding up.

    Returns a built-in ``int``. The fractional part is measured from
    ``floor(value)`` so that negative inputs round correctly as well
    (``-0.3 -> 0``, ``-1.2 -> -1``); measuring it from ``int(value)``, which
    truncates toward zero, sent such values one step too low.
    """
    lower = np.floor(value)
    fractional_part = value - lower  # always in [0, 1)
    if fractional_part < 0.5:
        return int(lower)
    return int(lower) + 1

# Round both predictions for the requested match with the custom rule
home_score_rounded, away_score_rounded = (
    custom_round(prediction) for prediction in (home_score_pred, away_score_pred)
)

# Print the score line in "home - away" form
print(f"{home_score_rounded} - {away_score_rounded}")

# Exact-score accuracy on the test set after rounding to whole goals
rounded_pred1, rounded_pred2 = np.round(y_pred1), np.round(y_pred2)

accuracy1 = accuracy_score(y_test1, rounded_pred1)
print("Accuracy for Home Score prediction:", accuracy1)

accuracy2 = accuracy_score(y_test2, rounded_pred2)
print("Accuracy for Away Score prediction:", accuracy2)
