In [1]:
## Import Required Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
import joblib
import warnings
warnings.filterwarnings('ignore')

In [2]:

# Load the match dataset from the working directory and preview it
dataset_path = 'FMEL_Dataset.csv'
df = pd.read_csv(dataset_path)

# Report the (rows, columns) size followed by the first five records
print("Dataset shape:", df.shape)
print("\nFirst 5 rows:")
print(df.head(5))

Dataset shape: (37147, 10)

First 5 rows:
   id   season  division  round           localTeam         visitorTeam  \
0   1  1970-71         1      1  Atletico de Bilbao           Barcelona   
1   2  1970-71         1      1          Las Palmas  Atletico de Madrid   
2   3  1970-71         1      1         Real Madrid            Valencia   
3   4  1970-71         1      1       Celta de Vigo   Sporting de Gijon   
4   5  1970-71         1      1               Elche             Granada   

   localGoals  visitorGoals        date  timestamp  
0           1             1  12/09/1970   21938400  
1           1             1  12/09/1970   21938400  
2           2             0  12/09/1970   21938400  
3           2             0  13/09/1970   22024800  
4           1             1  13/09/1970   22024800  


In [3]:
# Summarise column dtypes and non-null counts.
# DataFrame.info() writes its report straight to stdout and returns None,
# so wrapping it in print() would append a stray "None" line to the output.
print("\nDataset info:")
df.info()

# Count missing values per column
print("\nMissing values:")
print(df.isnull().sum())


Dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37147 entries, 0 to 37146
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            37147 non-null  int64 
 1   season        37147 non-null  object
 2   division      37147 non-null  int64 
 3   round         37147 non-null  int64 
 4   localTeam     37147 non-null  object
 5   visitorTeam   37147 non-null  object
 6   localGoals    37147 non-null  int64 
 7   visitorGoals  37147 non-null  int64 
 8   date          37147 non-null  object
 9   timestamp     37147 non-null  int64 
dtypes: int64(6), object(4)
memory usage: 2.8+ MB
None

Missing values:
id              0
season          0
division        0
round           0
localTeam       0
visitorTeam     0
localGoals      0
visitorGoals    0
date            0
timestamp       0
dtype: int64


In [4]:
## Data Preprocessing
# Discard every row that contains at least one missing value
df = df.dropna(axis=0, how='any')


In [5]:
# Create target variable based on goals
def determine_winner(row):
    """Label a match outcome from the home side's point of view.

    Parameters
    ----------
    row : mapping-like
        Anything supporting ``row['localGoals']`` and ``row['visitorGoals']``
        (for example a DataFrame row passed by ``df.apply(..., axis=1)``).

    Returns
    -------
    str
        'Local Win' when the local side scored more, 'Visitor Win' when the
        visitor scored more, and 'Draw' when neither comparison holds.
    """
    home_goals = row['localGoals']
    away_goals = row['visitorGoals']

    if home_goals > away_goals:
        return 'Local Win'
    if home_goals < away_goals:
        return 'Visitor Win'
    return 'Draw'

In [6]:
# Label every match by running determine_winner on each row
match_labels = df.apply(determine_winner, axis=1)
df['match_result'] = match_labels

In [7]:

# Show how many matches fall into each outcome class
result_counts = df['match_result'].value_counts()
print("\nMatch results distribution:")
print(result_counts)


Match results distribution:
match_result
Local Win      18787
Draw           10236
Visitor Win     8124
Name: count, dtype: int64


In [8]:
## Feature Engineering
# Encode categorical variables: one independent LabelEncoder per column.
# NOTE(review): scikit-learn documents LabelEncoder as an encoder for target
# values (y); OrdinalEncoder is the feature-side counterpart. Confirm before
# switching, because these objects are later persisted as .pkl artefacts.
le_local, le_visitor, le_season, le_division, le_round = (
    LabelEncoder() for _ in range(5)
)

In [9]:
# Fit each encoder on its source column and store the resulting integer codes.
# The local and visitor encoders are fitted separately, each on its own column.
encoding_plan = (
    (le_local, 'localTeam', 'localTeam_encoded'),
    (le_visitor, 'visitorTeam', 'visitorTeam_encoded'),
    (le_season, 'season', 'season_encoded'),
    (le_division, 'division', 'division_encoded'),
    (le_round, 'round', 'round_encoded'),
)
for encoder, source_col, encoded_col in encoding_plan:
    df[encoded_col] = encoder.fit_transform(df[source_col])

In [10]:
# Create additional features
# Both columns are derived from the final score of the match itself.
df['goal_difference'] = df['localGoals'].sub(df['visitorGoals'])
df['total_goals'] = df['localGoals'].add(df['visitorGoals'])

In [15]:
# Parse match dates (day/month/year).
# Keep a handle on the unparsed column: the fallback must see the ORIGINAL
# values. Re-parsing the already-coerced result can only yield NaT again,
# which would make the fallback a no-op.
raw_dates = df['date']
parsed_dates = pd.to_datetime(raw_dates, format='%d/%m/%Y', errors='coerce')

# If that doesn't work for some rows, try automatic parsing with dayfirst=True
if parsed_dates.isna().any():
    fallback_dates = pd.to_datetime(raw_dates, dayfirst=True, errors='coerce')
    parsed_dates = parsed_dates.fillna(fallback_dates)

df['date'] = parsed_dates

# Calendar features derived from the parsed date (dayofweek: Monday=0 ... Sunday=6)
df['month'] = df['date'].dt.month
df['day_of_week'] = df['date'].dt.dayofweek


In [17]:
# Calculate team statistics (historical performance)
def calculate_team_stats(df):
    """Aggregate win and draw rates per team over every match in ``df``.

    Each match is counted once for the local team and once for the visitor
    team. The aggregation is a single grouped pass instead of re-filtering
    the whole frame for every team, and the returned keys are in sorted
    team order so the result is reproducible between runs.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'localTeam', 'visitorTeam', 'localGoals'
        and 'visitorGoals'.

    Returns
    -------
    dict
        ``{team: {'win_rate': float, 'draw_rate': float, 'total_games': int}}``
        for every team that appears as local or visitor. An empty frame
        yields an empty dict.

    Notes
    -----
    The statistics cover every row of the frame that is passed in. If that
    frame also contains the matches being predicted, the rates include those
    matches' own outcomes.
    Rows whose team name is missing are not attributed to any team.
    """
    if df.empty:
        return {}

    local_goals = df['localGoals'].to_numpy()
    visitor_goals = df['visitorGoals'].to_numpy()
    drawn = (local_goals == visitor_goals).astype(int)

    # Stack the home and away perspectives into one long table:
    # one row per (team, match) with win/draw indicator columns.
    appearances = pd.DataFrame({
        'team': np.concatenate([df['localTeam'].to_numpy(), df['visitorTeam'].to_numpy()]),
        'win': np.concatenate([
            (local_goals > visitor_goals).astype(int),
            (visitor_goals > local_goals).astype(int),
        ]),
        'draw': np.concatenate([drawn, drawn]),
    })

    # groupby sorts the team keys by default, which fixes the dict order.
    totals = appearances.groupby('team').agg(
        games=('win', 'size'),
        wins=('win', 'sum'),
        draws=('draw', 'sum'),
    )

    team_stats = {}
    for team, games, wins, draws in totals.itertuples(name=None):
        games = int(games)
        # Plain Python numbers keep the pickled artefact free of numpy scalars.
        team_stats[team] = {
            'win_rate': int(wins) / games if games > 0 else 0,
            'draw_rate': int(draws) / games if games > 0 else 0,
            'total_games': games,
        }

    return team_stats

# Aggregate per-team rates over the whole frame.
# NOTE(review): this runs before the train/test split, so the rates include
# the outcomes of matches that are later held out for evaluation.
team_stats = calculate_team_stats(df=df)

In [18]:
# Add team statistics to dataframe
def _team_rate(team_col, rate_key):
    """Look up ``rate_key`` in ``team_stats`` for each team named in ``df[team_col]``."""
    return df[team_col].map(lambda team: team_stats[team][rate_key])


df['local_win_rate'] = _team_rate('localTeam', 'win_rate')
df['visitor_win_rate'] = _team_rate('visitorTeam', 'win_rate')
df['local_draw_rate'] = _team_rate('localTeam', 'draw_rate')
df['visitor_draw_rate'] = _team_rate('visitorTeam', 'draw_rate')

In [19]:
# Prepare features for training
features = [
    # encoded identifiers
    'localTeam_encoded',
    'visitorTeam_encoded',
    'season_encoded',
    'division_encoded',
    'round_encoded',
    # calendar features
    'month',
    'day_of_week',
    # aggregated team rates
    'local_win_rate',
    'visitor_win_rate',
    'local_draw_rate',
    'visitor_draw_rate',
]

# Feature matrix and target vector
X = df.loc[:, features]
y = df.loc[:, 'match_result']

print("\nFeatures shape:", X.shape)
print("Target distribution:")
target_counts = y.value_counts()
print(target_counts)


Features shape: (37147, 11)
Target distribution:
match_result
Local Win      18787
Draw           10236
Visitor Win     8124
Name: count, dtype: int64


In [20]:
# Split data and train model
# Hold out 20% of the matches, stratified on the outcome label;
# the fixed seed keeps the partition reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)


In [21]:

# Train Random Forest model
# 100 trees capped at depth 10, seeded for repeatable fits
rf_params = {
    'n_estimators': 100,
    'max_depth': 10,
    'random_state': 42,
}
rf_model = RandomForestClassifier(**rf_params)
rf_model.fit(X_train, y_train)


In [22]:
# Make predictions
# Predicted outcome label for every held-out match
y_pred = rf_model.predict(X=X_test)

In [23]:
# Evaluate model
# Overall accuracy plus per-class precision / recall / F1 on the held-out set
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)

print(f"\nModel Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(report)


Model Accuracy: 0.52

Classification Report:
              precision    recall  f1-score   support

        Draw       0.39      0.03      0.06      2047
   Local Win       0.52      0.96      0.68      3758
 Visitor Win       0.47      0.11      0.18      1625

    accuracy                           0.52      7430
   macro avg       0.46      0.37      0.30      7430
weighted avg       0.48      0.52      0.40      7430



In [24]:
## Feature importance
# Pair each feature name with the forest's importance score, highest first
importance_table = pd.DataFrame({
    'feature': features,
    'importance': rf_model.feature_importances_,
})
feature_importance = importance_table.sort_values(by='importance', ascending=False)

print("\nFeature Importance:")
print(feature_importance)


Feature Importance:
                feature  importance
2        season_encoded    0.183947
7        local_win_rate    0.149204
8      visitor_win_rate    0.139500
4         round_encoded    0.105477
9       local_draw_rate    0.096507
10    visitor_draw_rate    0.095293
0     localTeam_encoded    0.061990
5                 month    0.060008
1   visitorTeam_encoded    0.059529
6           day_of_week    0.034667
3      division_encoded    0.013879


In [25]:
# Save model and encoders
# Each fitted object is written to its own pickle file in the working directory
for artifact, filename in (
    (rf_model, 'football_predictor_model.pkl'),
    (le_local, 'local_team_encoder.pkl'),
    (le_visitor, 'visitor_team_encoder.pkl'),
    (le_season, 'season_encoder.pkl'),
    (le_division, 'division_encoder.pkl'),
    (le_round, 'round_encoder.pkl'),
):
    joblib.dump(artifact, filename)

# Kept as the cell's final expression so the notebook still echoes the written path
joblib.dump(team_stats, 'team_statistics.pkl')


['team_statistics.pkl']