In [17]:
# Attach Google Drive to the Colab runtime so the dataset directory under
# /content/drive is readable by the cells below.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [18]:
import pandas as pd
import os
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier



In [19]:
# List the match workbooks in the dataset directory.
# os.listdir() returns *every* entry in arbitrary order, and each name in
# `files` is later handed to pd.read_excel. Keep only real .xlsx workbooks
# (skipping Excel's "~$" lock files) and sort them so the load order -- and
# therefore the row order of the combined frame -- is reproducible.
files = sorted(
    name
    for name in os.listdir('/content/drive/MyDrive/Match prediction/Dataset')
    if name.lower().endswith('.xlsx') and not name.startswith('~$')
)

# Print the file names
print(files)


['Match 2022.xlsx', 'Match 2023.xlsx', 'Match 2024.xlsx']


In [20]:
# Load each workbook named in `files` (its first column becomes the index)
# and stack the resulting frames vertically into a single DataFrame.
yearly_frames = [
    pd.read_excel(
        f"/content/drive/MyDrive/Match prediction/Dataset/{workbook}",
        index_col=0,
    )
    for workbook in files
]
df = pd.concat(yearly_frames)


In [21]:
# Preview the first rows of the combined frame (head() defaults to five rows).
df.head()

Unnamed: 0_level_0,Time,League,Country,Home Team,Away Team,Goals Home,Goals Away,Odd Home,Odd Draw,Odd Away,Best Tip,Best Tip Trust,Best Tip Odd,Underover,Trust Underover,Odd Underover,Final Result,Trust Final Result,tipOdd
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2022-01-16,00:00,Liga Nacional,Guatemala,Antigua GFC,Achuapa,1,0,1.3,4.35,7.0,1,83,1.3,under 2.5,33.8463,1.8,1,83.3333,1.3
2022-01-16,01:00,Liga MX,Mexico,Tigres UANL,Puebla,0,2,1.58,3.8,5.5,1,84,1.58,under 2.5,13.0,1.73,1,84.781,1.58
2022-01-16,03:00,Liga MX,Mexico,Cruz Azul,FC Juarez,1,0,1.58,3.8,5.5,over 1.5,66,1.39,over 1.5,66.201,1.39,1,40.7193,1.58
2022-01-16,03:06,Liga MX,Mexico,Club Tijuana,Leon,1,1,2.65,3.2,2.57,X2,51,1.41,under 2.5,22.3168,1.69,2,39.3339,2.57
2022-01-16,10:00,TFF 2. Lig,Turkey,Afjet Afyonspor,BB Bodrumspor,0,2,1.65,3.75,3.9,1,56,1.65,over 2.5,53.047,1.75,1,56.8148,1.65


In [22]:
# Show the frame's dimensions as a (rows, columns) tuple.
df.shape

(93492, 19)

In [23]:
# Print the column labels of the combined frame.
print(df.columns)


Index(['Time', 'League', 'Country', 'Home Team', 'Away Team', 'Goals Home',
       'Goals Away', 'Odd Home', 'Odd Draw', 'Odd Away', 'Best Tip',
       'Best Tip Trust', 'Best Tip Odd', 'Underover', 'Trust Underover',
       'Odd Underover', 'Final Result', 'Trust Final Result', 'tipOdd'],
      dtype='object')


In [24]:
# Show the dtype pandas assigned to each column.
df.dtypes

Time                   object
League                 object
Country                object
Home Team              object
Away Team              object
Goals Home              int64
Goals Away              int64
Odd Home              float64
Odd Draw              float64
Odd Away              float64
Best Tip               object
Best Tip Trust          int64
Best Tip Odd          float64
Underover              object
Trust Underover       float64
Odd Underover         float64
Final Result           object
Trust Final Result    float64
tipOdd                float64
dtype: object

In [25]:
# Missing-value handling: any row containing at least one NaN is removed
# outright (a deliberately simple strategy, kept for demonstration).
df.dropna(inplace=True)

# One-hot encode the league, country and team-name columns: each listed
# column is replaced by one indicator column per distinct value.
df = pd.get_dummies(
    data=df,
    columns=[
        "League",
        "Country",
        "Home Team",
        "Away Team",
    ],
)


In [28]:
# Standardize the listed numeric columns in place (zero mean, unit variance
# per column, as computed by StandardScaler).
# NOTE(review): the scaler is fit on the whole frame here, before the
# train/test split performed further down -- confirm this is intended.
numerical_cols = [
    "Goals Home",
    "Goals Away",
    "Odd Home",
    "Odd Draw",
    "Odd Away",
    "Best Tip Odd",
    "Odd Underover",
]
scaler = StandardScaler()
scaler.fit(df[numerical_cols])
df[numerical_cols] = scaler.transform(df[numerical_cols])


In [29]:
# Derived features: home-minus-away differences for goals and for odds.
# NOTE(review): these are computed from the columns as they stand at this
# point; an earlier cell standardizes all four inputs, so the differences
# are between scaled values rather than raw goals/odds -- confirm intended.
df['goal_difference'] = df['Goals Home'].sub(df['Goals Away'])
df['odd_difference'] = df['Odd Home'].sub(df['Odd Away'])

In [30]:

# Remove the kick-off time column, which is not used as a feature.
df.drop(columns=['Time'], inplace=True)

In [32]:
# Persist the preprocessed frame as CSV in the working directory.
# index=False means the frame's index is not written to the file.
df.to_csv(path_or_buf="preprocessed_dataset.csv", index=False)


In [33]:

# Downcast numeric data types to shrink the frame before it is written out.
# Integer and float columns are handled separately: downcast='integer' leaves
# a float column untouched whenever it holds non-integral values (e.g. the
# standardized odds), so applying it to every numeric column kept almost all
# of them at float64.
numeric_cols = df.select_dtypes(include=['int', 'float']).columns
for dtype_kind, downcast_to in (('int', 'integer'), ('float', 'float')):
    cols_of_kind = df.select_dtypes(include=[dtype_kind]).columns
    if len(cols_of_kind) > 0:
        df[cols_of_kind] = df[cols_of_kind].apply(pd.to_numeric, downcast=downcast_to)

# Encode the remaining categorical (object) columns as sparse dummy columns.
# The target column is explicitly excluded: it has object dtype, so it would
# otherwise be replaced by "Final Result_*" indicator columns and the later
# X/y split -- which looks up "Final Result" by name -- would raise KeyError.
TARGET_COL = "Final Result"
categorical_cols = (
    df.select_dtypes(include=['object']).columns.drop(TARGET_COL, errors='ignore')
)
df = pd.get_dummies(df, columns=categorical_cols, sparse=True)

In [35]:
# Write the encoded frame as a gzip-compressed CSV (index not written).
df.to_csv(
    path_or_buf="compressed_dataset.csv.gz",
    index=False,
    compression="gzip",
)

  df.to_csv("compressed_dataset.csv.gz", compression="gzip", index=False)


In [None]:
# Read the gzip-compressed CSV back into `df`; the compression scheme is
# inferred from the ".gz" file extension (pandas' default behaviour).
df = pd.read_csv("compressed_dataset.csv.gz", compression="infer")

In [None]:
# Split dataset into features (X) and target variable (y).
# "Final Result" is the target. It is an object-dtype column, so an upstream
# one-hot encoding of all object columns turns it into "Final Result_<label>"
# indicator columns; a plain df["Final Result"] lookup then raises KeyError.
# Accept either representation so the split works in both cases.
TARGET_COL = "Final Result"
target_prefix = f"{TARGET_COL}_"
target_dummy_cols = [col for col in df.columns if str(col).startswith(target_prefix)]

if TARGET_COL in df.columns:
    # Labels are cast to str so mixed int/str values (e.g. 1 and "X") form one
    # consistent label type for the classifier.
    y = df[TARGET_COL].astype(str)
    X = df.drop(columns=[TARGET_COL])
elif target_dummy_cols:
    # Rebuild the label from its indicator columns: the column holding the 1
    # names the class; strip the prefix to recover the original label.
    y = (
        df[target_dummy_cols]
        .astype("uint8")
        .idxmax(axis=1)
        .str[len(target_prefix):]
    )
    # All indicator columns must leave X, otherwise the target leaks into it.
    X = df.drop(columns=target_dummy_cols)
else:
    raise KeyError(
        f"Target column {TARGET_COL!r} (or its one-hot columns) not found in df"
    )


In [None]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# shuffle (and therefore the split) repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
from sklearn.linear_model import LogisticRegression
# Initialize and train the logistic-regression classifier (the model actually
# used here; RandomForestClassifier is imported above but not used).
# max_iter is raised above scikit-learn's default of 100: the feature matrix
# is wide (one-hot columns) and contains unscaled columns, so the default
# iteration budget is likely to stop before the solver converges.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)


In [None]:
# Predict a class label for every row of the held-out test features.
y_pred = model.predict(X=X_test)

In [None]:
# Score the predictions: fraction of test rows whose predicted label equals
# the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy:", accuracy)