In [1]:
# Imports
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

# Data preparation functions
def read_csv_from_folder(folder_path, csv_filename):
    """Load a CSV file that lives inside a folder.

    Args:
        folder_path: Directory expected to contain the file.
        csv_filename: Name of the CSV file inside ``folder_path``.

    Returns:
        The parsed ``pandas.DataFrame``, or ``None`` when the path does not
        exist or reading it fails. A message is printed in every case.
    """
    full_path = os.path.join(folder_path, csv_filename)

    if os.path.exists(full_path):
        try:
            loaded = pd.read_csv(full_path)
            print("CSV file has been successfully loaded.")
        except Exception as err:
            # Best-effort loader: report the problem and signal failure with None.
            print(f"Error while reading CSV file: {err}")
            loaded = None
        return loaded

    print(f"CSV file '{csv_filename}' not found in '{folder_path}'. Please check the file path.")
    return None

def positions_hot_encoder(data):
    """Replace the ``position`` column with one-hot encoded columns.

    Args:
        data: DataFrame containing a ``position`` column.

    Returns:
        A new DataFrame with the original columns (minus ``position``)
        followed by one ``position_<category>`` column per encoded category.
        The row index of ``data`` is preserved.
    """
    encoder = OneHotEncoder(drop='if_binary')
    positions_encoded = encoder.fit_transform(data[['position']])
    positions_encoded_df = pd.DataFrame(
        positions_encoded.toarray(),
        columns=encoder.get_feature_names_out(['position']),
        # pd.concat(axis=1) aligns rows on index labels, so the encoded frame
        # must carry the caller's index; a default RangeIndex would misalign
        # rows (and add NaN rows) whenever ``data`` was filtered or re-indexed.
        index=data.index,
    )
    data_encoded = pd.concat([data, positions_encoded_df], axis=1)
    return data_encoded.drop('position', axis=1)

def ETL_process(folder_path, csv_filename, columns_to_drop):
    """Load the CSV, drop unwanted columns and one-hot encode positions.

    Args:
        folder_path: Directory expected to contain the CSV file.
        csv_filename: Name of the CSV file.
        columns_to_drop: Column labels removed before encoding.

    Returns:
        The prepared DataFrame, or ``None`` if the CSV could not be loaded.
    """
    raw = read_csv_from_folder(folder_path, csv_filename)
    if raw is None:
        return None

    trimmed = raw.drop(columns_to_drop, axis=1)
    return positions_hot_encoder(trimmed)

# Modeling functions
def predict_market_value(data, feature_columns, target_column):
    """Fit a linear regression on a train split and evaluate it on the test split.

    Args:
        data: DataFrame holding both the feature and target columns.
        feature_columns: Column labels used as model inputs.
        target_column: Column label of the value to predict.

    Returns:
        A DataFrame with ``actual_market_value`` and ``predicted_market_value``
        columns for the held-out 20% test split. MAE, MSE and R2 for that
        split are printed as a side effect.
    """
    features, target = data[feature_columns], data[target_column]

    # Fixed seed keeps the 80/20 split reproducible between runs.
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    model = Pipeline([('regressor', LinearRegression())])
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    results_df = pd.DataFrame({
        'actual_market_value': y_test,
        'predicted_market_value': y_pred
    })

    # Compute every metric first, then report them in a fixed order.
    metrics = (
        ("Mean Absolute Error (MAE)", mean_absolute_error(y_test, y_pred)),
        ("Mean Squared Error (MSE)", mean_squared_error(y_test, y_pred)),
        ("R-squared (R2)", r2_score(y_test, y_pred)),
    )
    for label, score in metrics:
        print(f"{label}: {score:.2f}")

    return results_df

# Data visualization functions
def visualize_data(df, results_df, x_column="starting_age", y_column="starting_rating",
                   color_column="starting_market_value"):
    """Plot model results and an interactive scatter of the source data.

    Draws three things:
      1. Overlaid histograms of actual and predicted market values.
      2. A predicted-vs-actual scatter plot with identical axis limits.
      3. An interactive Plotly scatter of ``df``.

    Args:
        df: DataFrame used for the interactive Plotly scatter.
        results_df: DataFrame with ``actual_market_value`` and
            ``predicted_market_value`` columns.
        x_column: Column of ``df`` plotted on the x axis of the Plotly figure.
        y_column: Column of ``df`` plotted on the y axis of the Plotly figure.
        color_column: Column of ``df`` used to color the Plotly points.

    Returns:
        None. Figures are shown as a side effect.
    """
    # Distribution of actual and predicted values with different colors
    plt.figure(figsize=(12, 6))

    # Distribution plot
    # NOTE(review): this 1x2 grid leaves its second slot empty because the
    # scatter plot below is drawn on its own figure.
    plt.subplot(1, 2, 1)
    sns.histplot(data=results_df, x='actual_market_value', kde=True, label='Actual Market Value', color='blue')
    sns.histplot(data=results_df, x='predicted_market_value', kde=True, label='Predicted Market Value', color='orange')
    plt.title("Distribution of Actual vs. Predicted Market Values")
    plt.xlabel("Market Value")
    plt.ylabel("Frequency")
    plt.legend()

    # Scatter plot
    plt.figure(figsize=(6, 6))
    actual = results_df['actual_market_value']
    predicted = results_df['predicted_market_value']
    plt.scatter(actual, predicted, color='green', alpha=0.5)
    plt.title("Predicted vs. Actual Market Values")
    plt.xlabel("Actual Market Value")
    plt.ylabel("Predicted Market Value")

    # Use the same limits on both axes. The lower bound drops below zero when
    # needed: a linear model can predict negative values, and a fixed 0 floor
    # would silently cut those points out of the plot.
    axis_lim = (
        min(0, actual.min(), predicted.min()),
        max(actual.max(), predicted.max()),
    )
    plt.xlim(axis_lim)
    plt.ylim(axis_lim)

    plt.tight_layout()
    plt.show()

    # Interactive scatter plot using Plotly. Only plot columns that exist in
    # df; otherwise report which ones are missing instead of raising.
    missing = [col for col in (x_column, y_column, color_column) if col not in df.columns]
    if missing:
        print(f"Skipping interactive scatter plot; columns not found in data: {missing}")
        return

    fig = px.scatter(df, x=x_column, y=y_column, color=color_column)
    fig.show()

# Main function
def main():
    """Run the pipeline: prepare the data, fit the model, show the plots.

    Does nothing further when the CSV cannot be loaded (the loader has
    already printed the reason).
    """
    columns_to_drop = ['skills', 'weak_foot', 'club', 'pot_min', 'pot_max', 'pierna_buena',
                      'season_promoted', 'traits', 'country']

    df = ETL_process("csv_path", "players_database.csv", columns_to_drop)
    if df is None:
        return

    # Numeric inputs plus the one-hot position columns produced by the ETL step.
    feature_columns = ['starting_age', 'starting_rating',
           'position_CAM', 'position_CB', 'position_CDM',
           'position_CM', 'position_GK', 'position_LB', 'position_LW',
           'position_RB', 'position_RW', 'position_ST']

    results_df = predict_market_value(df, feature_columns, 'starting_market_value')
    visualize_data(df, results_df)


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()


CSV file 'players_database.csv' not found in 'E:\Estudios\Programacion\github\Fifa Projects\Prediction Models'. Please check the file path.
