# Prediction of teams that will reach the Playoffs

## Section: Imports and Datasets loading

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
! pip install tabulate
from tabulate import tabulate
from sklearn.preprocessing import LabelEncoder
from scipy.stats import chi2_contingency

# Load every raw table of the basketballPlayoffs dataset into its own DataFrame.
players = pd.read_csv('basketballPlayoffs/players.csv', sep=",")
coaches = pd.read_csv('basketballPlayoffs/coaches.csv', sep=",")
teams = pd.read_csv('basketballPlayoffs/teams.csv', sep=",")
players_teams = pd.read_csv('basketballPlayoffs/players_teams.csv', sep=",")
teams_post = pd.read_csv('basketballPlayoffs/teams_post.csv', sep=",")
series_post = pd.read_csv('basketballPlayoffs/series_post.csv', sep=",")
awards_players = pd.read_csv('basketballPlayoffs/awards_players.csv', sep=",")
awards_coaches = pd.read_csv('basketballPlayoffs/awards_coaches.csv', sep=",")

# Quick visual check: only the first rows of players, the default repr of the rest.
print(players.head())
for _loaded_frame in (coaches, teams, players_teams, teams_post,
                      series_post, awards_players, awards_coaches):
    print(_loaded_frame)

## Section: Exploratory Data Analysis (EDA)

### Overview
This section explores each dataset to gain insight into the data and to understand the relationships between the features and the target variable. The process involves:

#### **Teams**:

1. **Checking for missing values**
2. **Counting unique values per column**
3. **Visualize distributions**
4. **Visualize correlations and patterns**: Create a correlation matrix, ``correlation_matrix``, to evaluate the relationship between variables, and perform a chi-square test, ``chi_square``, to evaluate the independence between each categorical feature and a specified target variable.

#### **Players**:

1.  **Checking for missing values**
2.  **Counting unique values per column**
3.  **Compare player heights**: Create a chart where heights are measured in cm and grouped into: ``<160.0``, ``160.0-170.0``, ``170.0-180.0``, ``180.0-190.0``, ``190.0-200.0``, ``>200.0``
4.  **Visualize the number of players per position**: Create a graphic where we visualize the distribution of player position: ``G``, ``F``, ``C``, ``F-C``, ``G-F``, ``C-F``
5.  **Visualize players top colleges**
6.  **Visualize correlations and patterns**: Create a correlation matrix, ``correlation_matrix``, to evaluate the relationship between variables

#### **Teams Post Season**

1. **Checking for missing values**
2. **Visualize win-loss ratios**: Calculate win-loss ratios with ``Win`` and ``Loss`` values, then create a ``Bar Chart`` to visualize the results

#### **Series Post Season**

1. **Visualize data insights**:
   - Teams that won in the playoffs each year
   - Teams that won and lost each year
   - Total appearances of each team in the finals

#### **Coaches**

1. **Checking for missing values**
2. **Extract the wins and losses data**: Create a ``Scatter Plot`` to visualize the wins and losses of each coach

#### **Awards Players**
1. **Checking for missing values**

#### **Awards Coaches**
1. **Checking for missing values**

### Teams metrics

In [None]:
teams.head()

teams.isnull().sum()

In [None]:
for column in teams.columns:
    unique_values = teams[column].unique()
    print(f"Number of different values in the {column} column are:", len(unique_values))
    print("------------")

In [None]:
# Distinct (non-null) value count of every column in `teams`, in column order.
_distinct_per_column = teams.nunique()
columns = _distinct_per_column.index.tolist()
value_counts = _distinct_per_column.tolist()

# Horizontal bars: one per column, length = number of distinct values.
plt.figure(figsize=(10, 10))
plt.barh(columns, value_counts, color='skyblue')
plt.title('Number of Unique Values in Each Column')
plt.xlabel('Number of Unique Values')
plt.ylabel('Columns')
plt.show()

In [None]:
print(teams.dtypes)

In [None]:
teams_numeric = teams.copy()

for column in teams_numeric.columns:
    if teams_numeric[column].dtype == 'object':
        teams_numeric[column] = teams_numeric[column].astype('category').cat.codes

teams_numeric.describe()

In [None]:
def correlation_matrix(dataframe):
    """Plot a correlation heatmap and report correlations with ``playoff``.

    Computes the pairwise correlation of all columns of ``dataframe``, draws
    it as an annotated heatmap, prints the correlation of each column with
    the ``playoff`` column and returns those values.

    Args:
        dataframe: DataFrame containing a ``playoff`` column. Callers in this
            notebook encode object columns as integer codes before calling.

    Returns:
        A mapping ``{column name: correlation with 'playoff'}``.

    Raises:
        KeyError: if ``dataframe`` has no ``playoff`` column.
    """
    corr_matrix = dataframe.corr()

    target_correlation = corr_matrix['playoff']

    plt.figure(figsize=(30, 20))

    # Mask the diagonal and the upper triangle: the matrix is symmetric, so
    # only the lower triangle carries non-redundant information.
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

    sns.heatmap(corr_matrix, mask=mask, annot=True, annot_kws={"size": 8}, cmap='coolwarm', linewidths=0.5, fmt=".2f")

    plt.title('Correlation Matrix', fontsize=16)
    plt.show()

    # Collected under a descriptive name so the builtin `dict` is not shadowed.
    correlations = {}

    for feature, correlation in target_correlation.items():
        print(f"Correlation between target and {feature}: {correlation}")
        correlations[feature] = correlation

    return correlations

In [None]:
correlation_matrix(teams_numeric)

In [None]:
def chi_square(dataset, target):
    """Print a chi-square independence test of every column against ``target``.

    For each column other than ``target`` a contingency table of that column
    versus ``target`` is built and passed to ``chi2_contingency``; the test
    statistic and the p-value are printed. Columns whose contingency table is
    empty are reported and skipped.

    Args:
        dataset: DataFrame holding the features and the target column.
        target: name of the target column inside ``dataset``.

    Returns:
        None. Results are only printed.
    """
    for feature in dataset.columns:
        if feature == target:
            continue

        contingency_table = pd.crosstab(dataset[feature], dataset[target])

        # An empty table means no row had both values present: nothing to test.
        if contingency_table.shape[0] == 0 or contingency_table.shape[1] == 0:
            print(f"No data for {feature} and {target}")
            continue

        # chi2_contingency yields (statistic, p-value, degrees of freedom,
        # expected frequencies); only the first two are reported here.
        chi2, p, _dof, _expected = chi2_contingency(contingency_table)

        print(f"Chi-square test for {feature} and {target}:")
        print(f"Chi-square value: {chi2}")
        print(f"P-value: {p}")
        print("")

In [None]:
chi_square(teams, 'playoff')

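The p-value is the probability of observing a relationship at least as extreme as the one in our sample data, assuming that there is no actual relationship in the population. A small p-value (commonly below 0.05) therefore suggests that the feature and the target are not independent.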

### Players metrics

In [None]:
print(players['playerID'].nunique()) 

print(players.head())

players.isnull().sum()

In [None]:
# Count the rows where 'firstseason' is not equal to 0
non_zero_firstseason_count = len(players[players['firstseason'] != 0])

# Count the rows where 'firstseason' is not equal to 0
non_zero_lastseason_count = len(players[players['lastseason'] != 0])

# Count the rows where 'deathDate' is not equal to "0000-00-00"
players['deathDate'] = players['deathDate'].str.strip()
non_empty_deathDate_count = len(players[players['deathDate'] != "0000-00-00"])

# Count the rows where 'collegeOther' is not equal to ""
non_nan_collegeOther_count = players['collegeOther'].notna().sum()

print("Number of rows with 'firstseason' different from 0:", non_zero_firstseason_count)
print("Number of rows with 'lastseason' different from 0:", non_zero_lastseason_count)
print("Number of rows with 'collegeOther' different from "":", non_nan_collegeOther_count)
print("Number of rows with 'deathDate' different from '0000-00-00':", non_empty_deathDate_count)

#### Players heights comparison

In [None]:
# Heights are stored in inches; derive a centimetre column (1 inch = 2.54 cm).
players['height_cm'] = players['height'] * 2.54

# Display labels of the height buckets, in ascending order.
height_categories = ['< 160.0 cm', '160.0 - 170.0 cm', '170.0 - 180.0 cm', '180.0 - 190.0 cm', '190.0 - 200.0 cm', '> 200.0 cm']

# (lower, upper) bounds in cm of each bucket, aligned with the labels above.
height_ranges = [(0, 160.0), (160.0, 170.0), (170.0, 180.0), (180.0, 190.0), (190.0, 200.0), (200.0, float('inf'))]

# Bin edges = every bucket's lower bound, closed off by a final +inf edge.
_bin_edges = [lower for lower, _upper in height_ranges]
_bin_edges.append(float('inf'))

# Assign each player to a bucket.
players['height_category'] = pd.cut(players['height_cm'], bins=_bin_edges, labels=height_categories)

# Players per bucket, in label order; buckets without players count as 0.
height_category_counts = (
    players['height_category']
    .value_counts()
    .reindex(height_categories, fill_value=0)
)

# Bar chart of the bucket sizes.
plt.figure(figsize=(10, 6))
ax = sns.barplot(x=height_category_counts.index, y=height_category_counts.values, palette='Set2')

# Write the integer count just above the centre of every bar.
for p in ax.patches:
    bar_top = p.get_height()
    bar_centre = p.get_x() + p.get_width() / 2.
    ax.annotate(f'{int(bar_top)}', (bar_centre, bar_top), ha='center', va='center', fontsize=10, color='black', xytext=(0, 5), textcoords='offset points')

plt.title('Player Count in Height Categories')
plt.xlabel('Height Category')
plt.ylabel('Count')

plt.show()

#### Number of players in each position

In [None]:
# Count plot of player positions, most frequent position first
plt.figure(figsize=(10, 6))  # Adjust the figure size as needed
ax = sns.countplot(data=players, x='pos', order=players['pos'].value_counts().index, palette='Set2')

# Label each bar with its count. Bar heights are floats, so cast to int to
# print "123" rather than "123.0" (same convention as the height chart).
for p in ax.patches:
    ax.annotate(f'{int(p.get_height())}', (p.get_x() + p.get_width() / 2., p.get_height()), ha='center', va='center', fontsize=10, color='black', xytext=(0, 5), textcoords='offset points')

# Set plot title and labels
plt.title('Number of Players in Each Position')
plt.xlabel('Position')
plt.ylabel('Count')

# Rotate x-axis labels for better readability (optional)
plt.xticks(rotation=45)

# Rows with a missing position are not part of the chart; report them separately
empty_pos_count = players['pos'].isnull().sum()
print("Number of rows with empty 'pos':", empty_pos_count)

# Show the plot
plt.show()

#### Top 10 Colleges

In [None]:
# Get the top 10 colleges with the most players
top_10_colleges = players['college'].value_counts().iloc[:10]

# Count plot restricted to (and ordered by) those 10 colleges
plt.figure(figsize=(12, 6))  # Adjust the figure size as needed
ax = sns.countplot(data=players, x='college', order=top_10_colleges.index, palette='Set2')

# Label each bar with its count. Bar heights are floats, so cast to int to
# print "12" rather than "12.0" (same convention as the height chart).
for p in ax.patches:
    ax.annotate(f'{int(p.get_height())}', (p.get_x() + p.get_width() / 2., p.get_height()), ha='center', va='center', fontsize=10, color='black', xytext=(0, 5), textcoords='offset points')

# Set plot title and labels
plt.title('Top 10 Colleges with the Most Players')
plt.xlabel('College')
plt.ylabel('Count')

# College names are long: rotate the x-axis labels so they do not overlap
plt.xticks(rotation=90)

# Show the plot
plt.show()

#### Correlation Matrix between numeric columns

In [None]:
# Select the columns for correlation analysis
numeric_columns = ["firstseason", "lastseason", "height", "weight"]

# Create a subset of the dataset with only the numeric columns
subset = players[numeric_columns]

# Calculate the correlation matrix
corr_matrix = subset.corr()

# Create a heatmap of the correlation matrix
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm", fmt=".2f", linewidths=0.5)
plt.title('Correlation Matrix')
plt.show()

### Teams Post metrics

In [None]:
print(teams_post.head())

teams_post.isnull().sum()

In [None]:
# Per-row share of post-season games won: W / (W + L).
# Kept as a column on teams_post so later cells can still read it.
teams_post['Win-Loss Ratio'] = teams_post['W'] / (teams_post['W'] + teams_post['L'])

# A team appears once per row of teams_post; plotting the rows directly would
# draw all bars of one team on the same category position, on top of each
# other. Aggregate wins and losses per team so each team gets exactly one bar.
team_totals = teams_post.groupby('tmID')[['W', 'L']].sum()
team_win_ratio = team_totals['W'] / (team_totals['W'] + team_totals['L'])

# Create a horizontal bar chart
plt.figure(figsize=(12, 8))
plt.barh(team_win_ratio.index, team_win_ratio.values, color='skyblue')

# Add labels and title
plt.xlabel('Win-Loss Ratio')
plt.ylabel('Team ID (tmID)')
plt.title('Win-Loss Ratios for Teams on  Post-Season (based on tmID)')

# Show the chart
plt.show()

### Series Post metrics

In [None]:
# Number of playoff series won by each team in each year
# (rows: year, columns: winning team, 0 where a team won no series that year)
team_wins_by_year = series_post.groupby(['year', 'tmIDWinner'])['W'].count().unstack(fill_value=0)

# Create a stacked bar chart. DataFrame.plot opens its own figure, so the size
# is passed to it directly; a preceding plt.figure() call would only leave an
# empty extra figure behind.
ax = team_wins_by_year.plot(kind='bar', stacked=True, colormap='Set3', figsize=(12, 8))
ax.set_xlabel('Year')
ax.set_ylabel('Number of Wins')
ax.set_title('Teams That Won In The Playoffs Each Year')

# Place the legend outside the axes so it does not cover the bars
ax.legend(title='Team', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

#### Teams that won and lost each year

In [None]:
# Filter for the "F" (Finals) round
finals_data = series_post[series_post['round'] == 'F']

# Create a DataFrame with the winning and losing teams for each year
finals_results = finals_data[['year', 'tmIDWinner', 'tmIDLoser']]

# Convert the DataFrame to a prettily formatted table
table = tabulate(finals_results, headers='keys', tablefmt='fancy_grid', showindex=False)

# Display the formatted table
print(table)

In [None]:
# Filter for the "F" (Finals) round
finals_data = series_post[series_post['round'] == 'F']

# Count how many times each team has appeared in the Finals as either a winner or a loser
team_appearances = pd.concat([finals_data['tmIDWinner'], finals_data['tmIDLoser']]).value_counts()

# Create a horizontal bar chart
plt.figure(figsize=(12, 6))
team_appearances.plot(kind='barh', color='skyblue')

# Add labels and title
plt.ylabel('Team (tmID)')
plt.xlabel('Number of Finals Appearances')
plt.title('Total Appearances of Each Team in the Finals')

# Show the chart
plt.show()

### Coaches metrics

In [None]:
print(coaches['coachID'].nunique()) 

print(coaches.head())

coaches.isnull().sum()

In [None]:
# Extract the wins and losses data from the "won" and "lost" columns
coach_wins = coaches['won']
coach_losses = coaches['lost']

# Create a scatter plot
import matplotlib.pyplot as plt

plt.scatter(coach_wins, coach_losses, alpha=0.5)
plt.xlabel('Wins')
plt.ylabel('Losses')
plt.title('Scatter Plot of Coach Wins vs. Losses')
plt.show()

### Awards Players

In [None]:
print(awards_players.isna().sum())
print(awards_players.head())

### Awards Coaches

In [None]:
print(awards_coaches.isna().sum())
print(awards_coaches.head())

## Section: Feature Selection
This section focuses on identifying and selecting the relevant features for the prediction model, using techniques such as correlation analysis, recursive feature elimination, or feature importance from tree-based models.


### Teams dataset

In [None]:
original_teams = teams.copy()

In [None]:
# Missing values per column, for reference when choosing what to drop.
print(teams.isna().sum())

# Every column removed by feature selection, dropped in a single pass.
_dropped_columns = [
    'lgID', 'divID', 'seeded', 'tmORB', 'tmDRB', 'tmTRB', 'opptmORB', 'opptmDRB', 'opptmTRB',
    'rank', 'firstRound', 'semis', 'finals',
    'min', 'o_oreb', 'o_dreb', 'd_oreb', 'd_dreb', 'name', 'franchID',
]
feature_selection_result = teams.drop(columns=_dropped_columns)

feature_selection_result.head()

In [None]:
feature_selection_result.to_csv('filtered/feature_selection_dataset.csv', index=False)

## Section: Feature Engineering

This section focuses on creating new features that might enhance the predictive power of the model. This could involve transforming existing features, creating interaction terms, or incorporating external data.

The feature engineering is done inside the `players.ipynb` file, and the creation of the new dataset is done inside the `create_final_team.csv` file.

### Shifting target variable

In [None]:
def shift_target_variable(dataset):
    """Replace each row's ``playoff`` value with that team's next-season value.

    The frame is modified in place: rows are sorted by ``tmID`` and ``year``,
    ``playoff`` is replaced by the value from the following row of the same
    team, rows without a following row are removed, and the index is reset.
    The ``playoff`` column ends up as the last column.

    Args:
        dataset: DataFrame with ``tmID``, ``year`` and ``playoff`` columns.

    Returns:
        The same ``dataset`` object, after the in-place transformation.
    """
    # Chronological order within each team, so "next row" means "next season".
    dataset.sort_values(by=['tmID', 'year'], inplace=True)

    next_season_outcome = dataset.groupby('tmID')['playoff'].shift(-1)
    dataset['playoffs'] = next_season_outcome
    del dataset['playoff']

    # The last row of every team has nothing to shift in and is discarded.
    dataset.dropna(subset=['playoffs'], inplace=True)

    dataset.rename(columns={'playoffs': 'playoff'}, inplace=True)
    dataset.reset_index(drop=True, inplace=True)
    return dataset

#### Original dataset

In [None]:
original_teams = shift_target_variable(original_teams)
original_teams.to_csv('filtered/original_teams.csv', index=False)

In [None]:
original_teams_numeric = original_teams.copy()

for column in original_teams_numeric.columns:
    if original_teams_numeric[column].dtype == 'object':
        original_teams_numeric[column] = original_teams_numeric[column].astype('category').cat.codes

correlation_matrix(original_teams_numeric)

#### Feature Selection dataset

In [None]:
feature_selection_result = shift_target_variable(feature_selection_result)

In [None]:
feature_teams_numeric = feature_selection_result.copy()

for column in feature_teams_numeric.columns:
    if feature_teams_numeric[column].dtype == 'object':
        feature_teams_numeric[column] = feature_teams_numeric[column].astype('category').cat.codes

correlation_matrix(feature_teams_numeric)

#### Feature engineering dataset

In [None]:
# TODO: check whether this can stay as it is or whether the code should be added here
feature_engineering_result = pd.read_csv('filtered/feature_engineering_dataset.csv', delimiter=",")

In [None]:
eng1_teams_numeric = shift_target_variable(feature_engineering_result.copy())
eng1_teams_numeric.to_csv('filtered/eng1_teams_numeric.csv', index=False)

In [None]:
eng1_teams_numeric = eng1_teams_numeric.copy()

for column in eng1_teams_numeric.columns:
    if eng1_teams_numeric[column].dtype == 'object':
        eng1_teams_numeric[column] = eng1_teams_numeric[column].astype('category').cat.codes

correlation_matrix(eng1_teams_numeric)

In [None]:
feature_engineering_result2 = pd.read_csv('filtered/team2_before_shift.csv', delimiter=",")
feature_engineering_result2.drop(columns=['playoff'], inplace=True)
copy_fe1 = feature_engineering_result.copy()
copy_fe1.drop(columns=['powerRanking2'], inplace=True)

feature_engineering_result2 = pd.merge(feature_engineering_result2, copy_fe1, on=['tmID', 'year'])
#feature_engineering_result2.to_csv('text.csv', index=False)
feature_engineering_result2 = shift_target_variable(feature_engineering_result2)
feature_engineering_result = shift_target_variable(feature_engineering_result)

feature_engineering_result2.to_csv('text.csv', index=False)

feature_engineering_result2.head()

In [None]:
eng2_teams_numeric = feature_engineering_result2.copy()

for column in eng2_teams_numeric.columns:
    if eng2_teams_numeric[column].dtype == 'object':
        eng2_teams_numeric[column] = eng2_teams_numeric[column].astype('category').cat.codes

correlation_matrix(eng2_teams_numeric)

In [None]:
continuous = ['year', 'average_powerRanking', 'average_PER', 'average_postPowerRanking', 'average_postPER']

In [None]:
eng2_teams_numeric.head()

## Section: Model Training and Classification
This section focuses on creating all the models and training them on the data. The models created are ``Decision Tree``, ``Random Forest``, ``Logistic Regression``, ``Support Vector Machines``, ``K-Nearest Neighbors``, ``Gradient Boosting``. The process involves:

- **Encoding**
- **Models**

### Encoding

Apply label encoding to the non-numerical values in the ``Original``, ``Feature Selection`` and ``Feature Engineering`` datasets

#### Original

In [None]:
# One encoder instance is reused; fit_transform refits it for every column.
label_encoder = LabelEncoder()

# Non-numeric columns of the original dataset, replaced by integer labels.
_columns_to_encode = (
    'tmID', 'confID', 'arena', 'name', 'franchID',
    'lgID', 'divID', 'firstRound', 'semis', 'finals',
)
for _column_name in _columns_to_encode:
    original_teams[_column_name] = label_encoder.fit_transform(original_teams[_column_name])

#### After Feature Selection

In [None]:
feature_selection_result['tmID'] = label_encoder.fit_transform(feature_selection_result['tmID'])
feature_selection_result['confID'] = label_encoder.fit_transform(feature_selection_result['confID'])
feature_selection_result['arena'] = label_encoder.fit_transform(feature_selection_result['arena'])

#### After feature engineering

In [None]:
# Apply label encoding to the columns 'teamID'
feature_engineering_result['tmID'] = label_encoder.fit_transform(feature_engineering_result['tmID'])

In [None]:
# Replace the team and conference identifiers by integer labels.
feature_engineering_result2['tmID'] = label_encoder.fit_transform(feature_engineering_result2['tmID'])
feature_engineering_result2['confID'] = label_encoder.fit_transform(feature_engineering_result2['confID'])
feature_engineering_result2.to_csv('text.csv', index=False)
# NOTE: encoding of 'arena', 'firstRound', 'semis' and 'finals' is disabled for
# this dataset. The disabled code used to sit here as a bare triple-quoted
# string, which is evaluated as an expression and echoed as the cell output.

### Models

- **Data Preparation**: Separates features (X) and the target variable (y) from the dataset, splits the data into training and testing sets

- **Hyperparameter Tuning**: Uses GridSearchCV to perform a grid search cross-validation to find the best hyperparameters based on accuracy; Creates a classifier with the best parameters and fits it to the training data

- **Prediction and Team Selection**: Predicts probabilities for the test set using the trained model; Sorts teams in each conference based on predicted probabilities and selects the top 4 teams from each conference

- **Custom Binary Classification**: Creates a binary classification for playoff selection based on a threshold; Plots a learning curve for the random forest model on the training set

- **ROC Curve and AUC**: Plots the ROC curve and calculates the Area Under the Curve (AUC) for the model on the test set

- **Evaluation**: For each model, an evaluation is made for each dataset, using accuracy and classification report

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
import numpy as np

from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score

from sklearn.svm import SVC

from sklearn.preprocessing import StandardScaler

from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import learning_curve

from sklearn.metrics import accuracy_score, classification_report

In [None]:
def plot_learning_curve(estimator, title, X, y, ylim=None, cv=None, scoring=None, n_jobs=-1, train_sizes=np.linspace(.1, 1.0, 5)):
    """Draw training and validation score curves for ``estimator``.

    Opens a new figure, runs ``learning_curve`` with the given arguments and
    plots the mean score per training-set size for the training folds (red)
    and the validation folds (green), each with a band of one standard
    deviation above and below the mean.

    Args:
        estimator: estimator forwarded to ``learning_curve``.
        title: figure title.
        X, y: data forwarded to ``learning_curve``.
        ylim: optional ``(low, high)`` limits for the y axis.
        cv, scoring, n_jobs, train_sizes: forwarded to ``learning_curve``.

    Returns:
        The ``matplotlib.pyplot`` module; call ``.show()`` on it to display.
    """
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel(scoring or 'Score')

    sizes, fit_scores, validation_scores = learning_curve(
        estimator, X, y, cv=cv, scoring=scoring, n_jobs=n_jobs, train_sizes=train_sizes,
    )

    # (mean per size, std per size, colour, legend label) for each curve.
    curves = [
        (fit_scores.mean(axis=1), fit_scores.std(axis=1), "r", "Training score"),
        (validation_scores.mean(axis=1), validation_scores.std(axis=1), "g", "Validation score"),
    ]

    plt.grid()

    # Bands first, lines second, so the lines are drawn on top of the bands.
    for centre, spread, colour, _label in curves:
        plt.fill_between(sizes, centre - spread, centre + spread, alpha=0.1, color=colour)
    for centre, _spread, colour, label in curves:
        plt.plot(sizes, centre, 'o-', color=colour, label=label)

    plt.legend(loc="best")
    return plt

The dataset is divided into 5 folds, and the learning curve is generated by training and evaluating the model on these 5 folds. The purpose is to visualize how the model's performance changes with the size of the training set while considering different subsets of the data.

#### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, roc_curve, auc
import matplotlib.pyplot as plt
import numpy as np

# Assuming you have the plot_learning_curve function defined
# If not, you can use sklearn's plot_learning_curve or define your own

def decision_tree_model(dataset, year, teams_per_conference=4):
    """Train a grid-searched decision tree and evaluate it on one season.

    Rows with ``year`` lower than the given year form the training set and
    rows of exactly that year form the test set. Hyperparameters are chosen
    with a 5-fold ``GridSearchCV`` on accuracy. For the test season, the
    ``teams_per_conference`` teams with the highest predicted probability in
    each conference are labelled as playoff teams. The function shows the
    learning curve and the ROC curve and prints the accuracy and the
    classification report of that selection.

    Args:
        dataset: DataFrame with ``year``, ``confID`` and ``playoff`` columns;
            ``playoff`` holds ``'N'``/``'Y'`` and all other columns must be
            numeric.
        year: season used as the test set.
        teams_per_conference: number of teams selected per conference.

    Returns:
        None. Results are plotted and printed.
    """
    train_rows = dataset[dataset['year'] < year]
    test_rows = dataset[dataset['year'] == year]

    X_train = train_rows.drop(columns=['playoff'])
    y_train = train_rows['playoff']

    X_test = test_rows.drop(columns=['playoff'])
    y_test = test_rows['playoff']

    param_grid = {
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 2, 3, 4, 5, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    dt_classifier = DecisionTreeClassifier(random_state=42)
    grid_search = GridSearchCV(estimator=dt_classifier, param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    best_params = grid_search.best_params_
    best_model = DecisionTreeClassifier(random_state=42, **best_params)
    best_model.fit(X_train, y_train)

    # Probability of the second class; with 'N'/'Y' labels that is 'Y'.
    y_scores = best_model.predict_proba(X_test)[:, 1]

    # Select the best-ranked teams of every conference. Conferences are taken
    # from the values actually present in confID (they are label-encoded, so
    # comparing against fixed strings would match nothing), and positions
    # inside a conference are mapped back to positions in the full test set.
    conferences = X_test['confID'].to_numpy()
    y_pred_custom = np.zeros(len(X_test))
    for conference in X_test['confID'].unique():
        conference_positions = np.flatnonzero(conferences == conference)
        ranking = np.argsort(y_scores[conference_positions])[::-1]
        y_pred_custom[conference_positions[ranking[:teams_per_conference]]] = 1

    title = "Learning Curves (Decision Tree)"
    scoring = "accuracy"

    # Plot learning curve
    plot_learning_curve(
        best_model,
        title,
        X_train,
        y_train,
        cv=5,
        scoring=scoring,
    )

    plt.show()

    # Numeric ground truth, shared by the ROC curve and the final report.
    y_test_binary = y_test.map({'N': 0, 'Y': 1})

    fpr, tpr, thresholds = roc_curve(y_test_binary, y_scores)
    roc_auc = auc(fpr, tpr)

    # Plot ROC curve
    plt.figure(figsize=(8, 8))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlabel('False Positive Rate (FPR)')
    plt.ylabel('True Positive Rate (TPR)')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")

    plt.show()

    # Evaluate the per-conference selection
    accuracy = accuracy_score(y_test_binary, y_pred_custom)
    print(f"Accuracy: {accuracy:.2f}")

    print(classification_report(y_test_binary, y_pred_custom, zero_division=1))


##### Original dataset

In [None]:
decision_tree_model(original_teams, 9)

##### Feature Selection dataset

In [None]:
decision_tree_model(feature_selection_result, 9)

##### Feature engineering dataset

In [None]:
decision_tree_model(feature_engineering_result, 9)

In [None]:
decision_tree_model(feature_engineering_result2, 9)

#### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, roc_curve, auc
import matplotlib.pyplot as plt
import numpy as np

# Assuming you have the plot_learning_curve function defined
# If not, you can use sklearn's plot_learning_curve or define your own

def random_forest_model(dataset, year, teams_per_conference=4):
    """Train a grid-searched random forest and evaluate it on one season.

    Rows with ``year`` lower than the given year form the training set and
    rows of exactly that year form the test set. Hyperparameters are chosen
    with a 5-fold ``GridSearchCV`` on accuracy. For the test season, the
    ``teams_per_conference`` teams with the highest predicted probability in
    each conference are labelled as playoff teams. The function shows the
    learning curve and the ROC curve and prints the accuracy and the
    classification report of that selection.

    Args:
        dataset: DataFrame with ``year``, ``confID`` and ``playoff`` columns;
            ``playoff`` holds ``'N'``/``'Y'`` and all other columns must be
            numeric.
        year: season used as the test set.
        teams_per_conference: number of teams selected per conference.

    Returns:
        None. Results are plotted and printed.
    """
    train_rows = dataset[dataset['year'] < year]
    test_rows = dataset[dataset['year'] == year]

    X_train = train_rows.drop(columns=['playoff'])
    y_train = train_rows['playoff']

    X_test = test_rows.drop(columns=['playoff'])
    y_test = test_rows['playoff']

    # 'auto' is not listed for max_features: recent scikit-learn releases
    # reject it, and where it was accepted it meant 'sqrt' for classifiers.
    param_grid = {
        'n_estimators': [50, 100, 200, 300],
        'max_depth': [None, 10, 15, 20],
        'min_samples_split': [2, 5, 8, 12],
        'min_samples_leaf': [1, 2, 4, 6],
        'max_features': ['sqrt', 'log2', None, 0.8, 0.9]
    }

    rf_classifier = RandomForestClassifier(random_state=42)

    grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    best_params = grid_search.best_params_

    # Same seed as the searched estimator, so the refit model is reproducible
    # and matches what the grid search evaluated.
    best_model = RandomForestClassifier(random_state=42, **best_params)

    best_model.fit(X_train, y_train)

    # Probability of the second class; with 'N'/'Y' labels that is 'Y'.
    y_scores = best_model.predict_proba(X_test)[:, 1]

    # Select the best-ranked teams of every conference. Conferences are taken
    # from the values actually present in confID (they are label-encoded, so
    # comparing against fixed strings would match nothing), and positions
    # inside a conference are mapped back to positions in the full test set.
    conferences = X_test['confID'].to_numpy()
    y_pred_custom = np.zeros(len(X_test))
    for conference in X_test['confID'].unique():
        conference_positions = np.flatnonzero(conferences == conference)
        ranking = np.argsort(y_scores[conference_positions])[::-1]
        y_pred_custom[conference_positions[ranking[:teams_per_conference]]] = 1

    title = "Learning Curves (Random Forest)"
    scoring = "accuracy"

    plot_learning_curve(
        best_model,
        title,
        X_train,
        y_train,
        cv=5,
        scoring=scoring,
    )

    plt.show()

    # Numeric ground truth, shared by the ROC curve and the final report.
    y_test_binary = y_test.map({'N': 0, 'Y': 1})

    fpr, tpr, thresholds = roc_curve(y_test_binary, y_scores)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(8, 8))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlabel('False Positive Rate (FPR)')
    plt.ylabel('True Positive Rate (TPR)')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")

    plt.show()

    # Evaluate the per-conference selection
    accuracy = accuracy_score(y_test_binary, y_pred_custom)
    print(f"Accuracy: {accuracy:.2f}")

    print(classification_report(y_test_binary, y_pred_custom, zero_division=1))


##### Original dataset

In [None]:
random_forest_model(original_teams, 9)

##### Feature Selection dataset

In [None]:
random_forest_model(feature_selection_result, 9)

##### Feature engineering dataset

In [None]:
random_forest_model(feature_engineering_result, 9)

In [None]:
random_forest_model(feature_engineering_result2, 9)

In [None]:
for i in range(4, 10): 
    print("Year: ", i)
    random_forest_model(feature_engineering_result2, i)

In [None]:
#for i in range(1, 8): 
    

#### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, roc_curve, auc
import matplotlib.pyplot as plt

def logistic_regression_model(dataset, year):
    """Tune, fit and evaluate a logistic regression for one target season.

    Rows whose ``year`` is below ``year`` form the training set and rows whose
    ``year`` equals it form the test set. Features are standardized with
    statistics learned on the training rows, hyper-parameters are chosen by a
    5-fold grid search on accuracy, and the refitted model is evaluated with a
    learning curve, a ROC curve, the accuracy and a classification report.

    Args:
        dataset: DataFrame with a ``year`` column, a ``playoff`` target column
            holding ``'N'``/``'Y'`` labels, and the feature columns.
        year: Season used as the held-out test set.

    Returns:
        None. Results are plotted and printed.
    """
    # Imported locally: the import list of this cell does not bring it in.
    from sklearn.preprocessing import StandardScaler

    train_rows = dataset[dataset['year'] < year]
    test_rows = dataset[dataset['year'] == year]

    X_train = train_rows.drop(columns=['playoff'])
    y_train = train_rows['playoff']
    X_test = test_rows.drop(columns=['playoff'])
    y_test = test_rows['playoff']

    # Fit the scaler on the training rows only so no test statistics leak in.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    c_values = [0.001, 0.01, 0.1, 1, 10, 100]
    max_iters = [1000, 10000]
    # lbfgs supports only the l2 penalty; a single flat grid would pair it with
    # l1 and make those candidates fail to fit. Splitting the grid keeps every
    # candidate valid. Note that candidate order (and therefore tie-breaking
    # between equally scoring candidates) follows this list.
    param_grid = [
        {
            'C': c_values,
            'penalty': ['l1', 'l2'],
            'solver': ['liblinear', 'saga'],
            'max_iter': max_iters,
        },
        {
            'C': c_values,
            'penalty': ['l2'],
            'solver': ['lbfgs'],
            'max_iter': max_iters,
        },
    ]

    grid_search = GridSearchCV(
        estimator=LogisticRegression(),
        param_grid=param_grid,
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    # Rebuild an estimator from the winning parameters and fit it on all
    # training rows.
    best_model = LogisticRegression(**grid_search.best_params_)
    best_model.fit(X_train, y_train)

    y_pred = best_model.predict(X_test)

    plot_learning_curve(
        best_model,
        "Learning Curves (Logistic Regression)",
        X_train,
        y_train,
        cv=5,
        scoring="accuracy",
    )
    plt.show()

    y_test_binary = y_test.map({'N': 0, 'Y': 1})

    # Look the positive class up instead of assuming it is the second column.
    positive_col = list(best_model.classes_).index('Y')
    y_scores = best_model.predict_proba(X_test)[:, positive_col]

    fpr, tpr, _ = roc_curve(y_test_binary, y_scores)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(8, 8))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlabel('False Positive Rate (FPR)')
    plt.ylabel('True Positive Rate (TPR)')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.show()

    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.2f}")

    print(classification_report(y_test, y_pred, zero_division=1))

##### Original dataset

In [None]:
logistic_regression_model(original_teams, 9)

##### Feature Selection dataset

In [None]:
logistic_regression_model(feature_selection_result, 9)

##### Feature Engineering dataset

In [None]:
logistic_regression_model(feature_engineering_result, 9)

In [None]:
logistic_regression_model(feature_engineering_result2, 9)

In [None]:
# Evaluate the logistic regression once per target year 4 through 9:
# logistic_regression_model trains on the rows before year i and tests on
# year i. The year is printed first so each run's output can be told apart.
for i in range(4, 10):
    print("Year: ", i)
    logistic_regression_model(feature_engineering_result2, i)

#### Support Vector Machines (SVM)

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, roc_curve, auc
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt

def svm_model(dataset, year):
    """Tune, fit and evaluate a support vector classifier for one season.

    Rows whose ``year`` is below ``year`` form the training set and rows whose
    ``year`` equals it form the test set. Features are standardized, the
    hyper-parameters are chosen by a 5-fold grid search on accuracy, and the
    refitted model is evaluated with a learning curve, a ROC curve, the
    accuracy and a classification report.

    Args:
        dataset: DataFrame with a ``year`` column, a ``playoff`` target column
            holding ``'N'``/``'Y'`` labels, and the feature columns.
        year: Season used as the held-out test set.

    Returns:
        None. Results are plotted and printed.
    """
    train_rows = dataset[dataset['year'] < year]
    test_rows = dataset[dataset['year'] == year]

    X_train = train_rows.drop(columns=['playoff'])
    y_train = train_rows['playoff']
    X_test = test_rows.drop(columns=['playoff'])
    y_test = test_rows['playoff']

    # Fit the scaler on the training rows only so no test statistics leak in.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    param_grid = {
        'C': [0.1, 1],
        'kernel': ['linear', 'rbf'],
        'gamma': ['auto', 'scale'],
    }

    # Class weights multiply C per class, so they must already be present
    # during the search; otherwise the tuned C does not describe the model
    # that is finally evaluated. Equal weights rescale C for both classes
    # without rebalancing them.
    class_weight = {'Y': 5, 'N': 5}

    # Enable probability estimates (needed for predict_proba below).
    svc = SVC(probability=True, class_weight=class_weight)

    grid_search = GridSearchCV(estimator=svc, param_grid=param_grid, scoring='accuracy', cv=5, n_jobs=-1)
    grid_search.fit(X_train_scaled, y_train)

    # Same configuration as the searched estimator, with the winning parameters.
    best_model = SVC(**grid_search.best_params_, class_weight=class_weight, probability=True)
    best_model.fit(X_train_scaled, y_train)

    y_pred = best_model.predict(X_test_scaled)

    plot_learning_curve(
        best_model,
        "Learning Curves (SVM)",
        X_train_scaled,
        y_train,
        cv=5,
        scoring="accuracy",
    )
    plt.show()

    y_test_binary = y_test.map({'N': 0, 'Y': 1})

    # Look the positive class up instead of assuming it is the second column.
    positive_col = list(best_model.classes_).index('Y')
    y_scores = best_model.predict_proba(X_test_scaled)[:, positive_col]

    fpr, tpr, _ = roc_curve(y_test_binary, y_scores)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(8, 8))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlabel('False Positive Rate (FPR)')
    plt.ylabel('True Positive Rate (TPR)')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.show()

    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.2f}")

    print(classification_report(y_test, y_pred, zero_division=1))

##### Original dataset

In [None]:
svm_model(original_teams, 9)

##### Feature Selection

In [None]:
svm_model(feature_selection_result, 9)

##### Feature engineering dataset

In [None]:
svm_model(feature_engineering_result, 9)

In [None]:
svm_model(feature_engineering_result2, 9)

#### K-Nearest Neighbors (KNN)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt

def knn_model(dataset, year):
    """Tune, fit and evaluate a k-nearest-neighbours classifier for one season.

    Rows whose ``year`` is below ``year`` form the training set and rows whose
    ``year`` equals it form the test set. Features are standardized and passed
    through a ``SelectKBest`` step, the hyper-parameters are chosen by a 5-fold
    grid search on accuracy, and the refitted model is evaluated with a
    learning curve, a ROC curve, the accuracy and a classification report.

    Args:
        dataset: DataFrame with a ``year`` column, a ``playoff`` target column
            holding ``'N'``/``'Y'`` labels, and the feature columns.
        year: Season used as the held-out test set.

    Returns:
        None. Results are plotted and printed.
    """
    train_rows = dataset[dataset['year'] < year]
    test_rows = dataset[dataset['year'] == year]

    X_train = train_rows.drop(columns=['playoff'])
    y_train = train_rows['playoff']
    X_test = test_rows.drop(columns=['playoff'])
    y_test = test_rows['playoff']

    # Fit the scaler on the training rows only so no test statistics leak in.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # k='all' keeps every feature, so this step currently only reports the
    # feature names; lowering k turns it into a real filter.
    k_best_selector = SelectKBest(f_classif, k='all')
    X_train_selected = k_best_selector.fit_transform(X_train_scaled, y_train)

    selected_features = X_train.columns[k_best_selector.get_support()]
    print("Selected Features:", selected_features)

    param_grid = {
        'n_neighbors': [1, 3, 5, 7, 10],
        'metric': ['euclidean', 'manhattan', 'minkowski'],
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
        'p': [1, 2]
    }

    grid_search = GridSearchCV(
        estimator=KNeighborsClassifier(),
        param_grid=param_grid,
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
    )
    grid_search.fit(X_train_selected, y_train)

    # Rebuild an estimator from the winning parameters and fit it on all
    # training rows.
    best_model = KNeighborsClassifier(**grid_search.best_params_)
    best_model.fit(X_train_selected, y_train)

    X_test_selected = k_best_selector.transform(X_test_scaled)
    y_pred = best_model.predict(X_test_selected)

    plot_learning_curve(
        best_model,
        "Learning Curves (KNN)",
        X_train_selected,
        y_train,
        cv=5,
        scoring="accuracy",
    )
    plt.show()

    y_test_binary = y_test.map({'N': 0, 'Y': 1})

    # Look the positive class up instead of assuming it is the second column.
    positive_col = list(best_model.classes_).index('Y')
    y_scores = best_model.predict_proba(X_test_selected)[:, positive_col]

    fpr, tpr, _ = roc_curve(y_test_binary, y_scores)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(8, 8))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlabel('False Positive Rate (FPR)')
    plt.ylabel('True Positive Rate (TPR)')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.show()

    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.2f}")

    print(classification_report(y_test, y_pred))

##### Original dataset

In [None]:
knn_model(original_teams, 9)

##### Feature Selection dataset

In [None]:
knn_model(feature_selection_result, 9)

##### Feature engineering dataset

In [None]:
knn_model(feature_engineering_result, 9)

In [None]:
knn_model(feature_engineering_result2, 9)

#### Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

def gradient_boosting_model(dataset, year):
    """Tune, fit and evaluate a gradient boosting classifier for one season.

    Rows whose ``year`` is below ``year`` form the training set and rows whose
    ``year`` equals it form the test set. The hyper-parameters are chosen by a
    5-fold grid search on accuracy, and the refitted model is evaluated with a
    learning curve, a ROC curve, the accuracy and a classification report.

    Args:
        dataset: DataFrame with a ``year`` column, a ``playoff`` target column
            holding ``'N'``/``'Y'`` labels, and the feature columns.
        year: Season used as the held-out test set.

    Returns:
        None. Results are plotted and printed.
    """
    train_rows = dataset[dataset['year'] < year]
    test_rows = dataset[dataset['year'] == year]

    X_train = train_rows.drop(columns=['playoff'])
    y_train = train_rows['playoff']
    X_test = test_rows.drop(columns=['playoff'])
    y_test = test_rows['playoff']

    param_grid = {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 4, 5],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'subsample': [0.8, 1.0],
    }

    grid_search = GridSearchCV(
        estimator=GradientBoostingClassifier(),
        param_grid=param_grid,
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    # Rebuild an estimator from the winning parameters and fit it on all
    # training rows.
    best_model = GradientBoostingClassifier(**grid_search.best_params_)
    best_model.fit(X_train, y_train)

    y_pred = best_model.predict(X_test)

    # plot_learning_curve is defined elsewhere in this notebook.
    plot_learning_curve(
        best_model,
        "Learning Curves (Gradient Boosting)",
        X_train,
        y_train,
        cv=5,
        scoring="accuracy",
    )
    plt.show()

    y_test_binary = y_test.map({'N': 0, 'Y': 1})

    # Look the positive class up instead of assuming it is the second column.
    positive_col = list(best_model.classes_).index('Y')
    y_scores = best_model.predict_proba(X_test)[:, positive_col]

    fpr, tpr, _ = roc_curve(y_test_binary, y_scores)
    roc_auc = auc(fpr, tpr)

    plt.figure(figsize=(8, 8))
    plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = {:.2f})'.format(roc_auc))
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlabel('False Positive Rate (FPR)')
    plt.ylabel('True Positive Rate (TPR)')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.show()

    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.2f}")

    print(classification_report(y_test, y_pred))

##### Original dataset

In [None]:
gradient_boosting_model(original_teams, 9)

##### Feature Selection dataset

In [None]:
gradient_boosting_model(feature_selection_result, 9)

##### Feature Engineering dataset

In [None]:
gradient_boosting_model(feature_engineering_result, 9)

In [None]:
gradient_boosting_model(feature_engineering_result2, 9)

## Section: Model Evaluation
This section observes how each model's performance changes when it is trained on data from a varying number of previous years.

### Decision Tree

In [None]:
# Training-window sweep for the decision tree. For i = 1..8, keep only the rows
# with year >= 9 - i and pass them with target year 9.
# NOTE(review): decision_tree_model is defined elsewhere; presumably it trains
# on the rows before year 9 and tests on year 9 like the sibling model
# functions — confirm.
for i in range(1, 9): 
    print("Training year 9 based on previous ", i, " years.")
    filtered_feature_engineering_result2 = feature_engineering_result2[feature_engineering_result2['year'] >= 9 - i]
    decision_tree_model(filtered_feature_engineering_result2, 9)

### Random Forest

In [None]:
# Training-window sweep for the random forest. For i = 1..8, keep only the rows
# with year >= 9 - i and pass them with target year 9.
# NOTE(review): presumably random_forest_model trains on the rows before year 9
# and tests on year 9 like the sibling model functions — confirm.
for i in range(1, 9): 
    print("Training year 9 based on previous ", i, " years.")
    filtered_feature_engineering_result2 = feature_engineering_result2[feature_engineering_result2['year'] >= 9 - i]
    random_forest_model(filtered_feature_engineering_result2, 9)

### Logistic Regression

In [None]:
# Training-window sweep for logistic regression. For i = 1..8, keep only the
# rows with year >= 9 - i; logistic_regression_model then trains on the kept
# rows before year 9 (the i preceding seasons) and tests on year 9.
for i in range(1, 9): 
    print("Training year 9 based on previous ", i, " years.")
    filtered_feature_engineering_result2 = feature_engineering_result2[feature_engineering_result2['year'] >= 9 - i]
    logistic_regression_model(filtered_feature_engineering_result2, 9)

### SVM

In [None]:
# Training-window sweep for the SVM. For i = 1..8, keep only the rows with
# year >= 9 - i; svm_model then trains on the kept rows before year 9 (the i
# preceding seasons) and tests on year 9.
for i in range(1, 9): 
    print("Training year 9 based on previous ", i, " years.")
    filtered_feature_engineering_result2 = feature_engineering_result2[feature_engineering_result2['year'] >= 9 - i]
    svm_model(filtered_feature_engineering_result2, 9)

### KNN

In [None]:
# Training-window sweep for KNN. For i = 1..8, keep only the rows with
# year >= 9 - i; knn_model then trains on the kept rows before year 9 (the i
# preceding seasons) and tests on year 9.
for i in range(1, 9): 
    print("Training year 9 based on previous ", i, " years.")
    filtered_feature_engineering_result2 = feature_engineering_result2[feature_engineering_result2['year'] >= 9 - i]
    knn_model(filtered_feature_engineering_result2, 9)

### Gradient Boosting

In [None]:
# Training-window sweep for gradient boosting. For i = 1..8, keep only the rows
# with year >= 9 - i; gradient_boosting_model then trains on the kept rows
# before year 9 (the i preceding seasons) and tests on year 9.
for i in range(1, 9): 
    print("Training year 9 based on previous ", i, " years.")
    filtered_feature_engineering_result2 = feature_engineering_result2[feature_engineering_result2['year'] >= 9 - i]
    gradient_boosting_model(filtered_feature_engineering_result2, 9)

## Section: Testing for Final Dataset (Year 11)

### Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
import pandas as pd

def select_top_teams(predictions_df, n=4):
    """Return the ``n`` highest-probability rows of each conference.

    Args:
        predictions_df: DataFrame with at least a ``confID`` column to group
            by and a ``Probability`` column to rank by.
        n: Number of rows to keep per ``confID`` group (default 4).

    Returns:
        DataFrame with the same columns as ``predictions_df``: groups in
        ascending ``confID`` order, rows within a group in descending
        ``Probability`` order, and a fresh 0-based index.
    """
    # Iterate over the groups instead of using groupby.apply: newer pandas
    # releases stop passing the grouping column to the applied function,
    # which would drop ``confID`` from the result.
    top_per_conf = [
        conf_rows.nlargest(n, 'Probability')
        for _, conf_rows in predictions_df.groupby('confID')
    ]
    if not top_per_conf:
        # Nothing to rank: return an empty frame with the same columns.
        return predictions_df.iloc[0:0].reset_index(drop=True)
    return pd.concat(top_per_conf).reset_index(drop=True)

def final_decision_tree(dataset_train, dataset_predict):
    """Fit a tuned, class-weighted decision tree and print the predicted playoff teams.

    The tree is tuned with a 5-fold grid search on accuracy, refitted on all
    of ``dataset_train``, and used to score every row of ``dataset_predict``.
    The four highest-scoring teams per conference are printed.

    Args:
        dataset_train: DataFrame holding the feature columns and the
            ``playoff`` target (``'N'``/``'Y'`` labels).
        dataset_predict: DataFrame of rows to score. It is passed to the model
            unchanged and must include ``tmID`` and ``confID`` columns.

    Returns:
        None. The selected teams are printed.
    """
    X_train = dataset_train.drop(columns=['playoff'])
    y_train = dataset_train['playoff']

    X_test = dataset_predict

    param_grid = {
        'criterion': ['gini', 'entropy'],
        'max_depth': [None, 2, 3, 4, 5, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    class_weights = {'N': 1, 'Y': 14}

    grid_search = GridSearchCV(
        estimator=DecisionTreeClassifier(class_weight=class_weights),
        param_grid=param_grid,
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    # The final tree must carry the same class weights as the searched one;
    # best_params_ only holds the grid keys, so the weights are passed again.
    best_model = DecisionTreeClassifier(class_weight=class_weights, **grid_search.best_params_)
    best_model.fit(X_train, y_train)

    # Probability of the 'Y' class for each team, looked up by label rather
    # than by assumed column position.
    positive_col = list(best_model.classes_).index('Y')
    y_probabilities = best_model.predict_proba(X_test)[:, positive_col]

    # Combine predicted probabilities with team information
    predictions_df = pd.DataFrame({'TeamID': X_test['tmID'], 'Probability': y_probabilities, 'confID': X_test['confID']})

    # Select and display the top teams of each conference
    selected_teams = select_top_teams(predictions_df)
    print(selected_teams)

### Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
import pandas as pd

def select_top_teams(predictions_df, n=4):
    """Return the ``n`` highest-probability rows of each conference.

    Args:
        predictions_df: DataFrame with at least a ``confID`` column to group
            by and a ``Probability`` column to rank by.
        n: Number of rows to keep per ``confID`` group (default 4).

    Returns:
        DataFrame with the same columns as ``predictions_df``: groups in
        ascending ``confID`` order, rows within a group in descending
        ``Probability`` order, and a fresh 0-based index.
    """
    # Iterate over the groups instead of using groupby.apply: newer pandas
    # releases stop passing the grouping column to the applied function,
    # which would drop ``confID`` from the result.
    top_per_conf = [
        conf_rows.nlargest(n, 'Probability')
        for _, conf_rows in predictions_df.groupby('confID')
    ]
    if not top_per_conf:
        # Nothing to rank: return an empty frame with the same columns.
        return predictions_df.iloc[0:0].reset_index(drop=True)
    return pd.concat(top_per_conf).reset_index(drop=True)

def final_random_forest(dataset_train, dataset_predict):
    """Fit a tuned random forest and print the predicted playoff teams.

    The forest is tuned with a 5-fold grid search on accuracy, refitted on all
    of ``dataset_train``, and used to score every row of ``dataset_predict``.
    The four highest-scoring teams per conference are printed.

    Args:
        dataset_train: DataFrame holding the feature columns and the
            ``playoff`` target (``'N'``/``'Y'`` labels).
        dataset_predict: DataFrame of rows to score. It is passed to the model
            unchanged and must include ``tmID`` and ``confID`` columns.

    Returns:
        None. The selected teams are printed.
    """
    X_train = dataset_train.drop(columns=['playoff'])
    y_train = dataset_train['playoff']

    X_test = dataset_predict

    param_grid = {
        'n_estimators': [50, 100, 200, 300],
        'max_depth': [None, 10, 15, 20],
        'min_samples_split': [2, 5, 8, 12],
        'min_samples_leaf': [1, 2, 4, 6],
        # 'auto' is omitted: current scikit-learn rejects it for
        # RandomForestClassifier, and it used to mean the same as 'sqrt'.
        'max_features': ['sqrt', 'log2', None, 0.8, 0.9]
    }

    # No class weighting is applied to the forest.
    grid_search = GridSearchCV(
        estimator=RandomForestClassifier(),
        param_grid=param_grid,
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    # Rebuild an estimator from the winning parameters and fit it on all
    # training rows.
    best_model = RandomForestClassifier(**grid_search.best_params_)
    best_model.fit(X_train, y_train)

    # Probability of the 'Y' class for each team, looked up by label rather
    # than by assumed column position.
    positive_col = list(best_model.classes_).index('Y')
    y_probabilities = best_model.predict_proba(X_test)[:, positive_col]

    # Combine predicted probabilities with team information
    predictions_df = pd.DataFrame({'TeamID': X_test['tmID'], 'Probability': y_probabilities, 'confID': X_test['confID']})

    # Select and display the top teams of each conference
    selected_teams = select_top_teams(predictions_df)
    print(selected_teams)

### Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
import pandas as pd

def select_top_teams(predictions_df, n=4):
    """Return the ``n`` highest-probability rows of each conference.

    Args:
        predictions_df: DataFrame with at least a ``confID`` column to group
            by and a ``Probability`` column to rank by.
        n: Number of rows to keep per ``confID`` group (default 4).

    Returns:
        DataFrame with the same columns as ``predictions_df``: groups in
        ascending ``confID`` order, rows within a group in descending
        ``Probability`` order, and a fresh 0-based index.
    """
    # Iterate over the groups instead of using groupby.apply: newer pandas
    # releases stop passing the grouping column to the applied function,
    # which would drop ``confID`` from the result.
    top_per_conf = [
        conf_rows.nlargest(n, 'Probability')
        for _, conf_rows in predictions_df.groupby('confID')
    ]
    if not top_per_conf:
        # Nothing to rank: return an empty frame with the same columns.
        return predictions_df.iloc[0:0].reset_index(drop=True)
    return pd.concat(top_per_conf).reset_index(drop=True)

def final_logistic_regression(dataset_train, dataset_predict):
    """Fit a tuned logistic regression and print the predicted playoff teams.

    Features are standardized with statistics from ``dataset_train``. A
    liblinear logistic regression is tuned over penalty and ``C`` with a
    5-fold grid search on accuracy, refitted on all training rows, and used to
    score ``dataset_predict``. The four highest-scoring teams per conference
    are printed.

    Args:
        dataset_train: DataFrame holding the feature columns and the
            ``playoff`` target.
        dataset_predict: DataFrame of rows to score. It is scaled as a whole
            and must include ``tmID`` and ``confID`` columns.

    Returns:
        None. The selected teams are printed.
    """
    features = dataset_train.drop(columns=['playoff'])
    labels = dataset_train['playoff']

    # Learn the scaling on the training rows, then reuse it for prediction.
    standardizer = StandardScaler()
    features_scaled = standardizer.fit_transform(features)
    predict_scaled = standardizer.transform(dataset_predict)

    search = GridSearchCV(
        estimator=LogisticRegression(solver='liblinear'),
        param_grid={
            'penalty': ['l1', 'l2'],
            'C': [0.001, 0.01, 0.1, 1, 10, 100],
        },
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
    )
    search.fit(features_scaled, labels)

    # Fresh estimator with the winning parameters, fitted on every training row.
    tuned_model = LogisticRegression(solver='liblinear', **search.best_params_)
    tuned_model.fit(features_scaled, labels)

    # Column 1 of predict_proba is the second class in classes_
    # (presumably 'Y' — confirm against the label encoding).
    playoff_scores = tuned_model.predict_proba(predict_scaled)[:, 1]

    ranking = pd.DataFrame({
        'TeamID': dataset_predict['tmID'],
        'Probability': playoff_scores,
        'confID': dataset_predict['confID'],
    })

    print(select_top_teams(ranking))

### Support Vector Machine (SVC)

In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
import pandas as pd

def select_top_teams(predictions_df, n=4):
    """Return the ``n`` highest-probability rows of each conference.

    Args:
        predictions_df: DataFrame with at least a ``confID`` column to group
            by and a ``Probability`` column to rank by.
        n: Number of rows to keep per ``confID`` group (default 4).

    Returns:
        DataFrame with the same columns as ``predictions_df``: groups in
        ascending ``confID`` order, rows within a group in descending
        ``Probability`` order, and a fresh 0-based index.
    """
    # Iterate over the groups instead of using groupby.apply: newer pandas
    # releases stop passing the grouping column to the applied function,
    # which would drop ``confID`` from the result.
    top_per_conf = [
        conf_rows.nlargest(n, 'Probability')
        for _, conf_rows in predictions_df.groupby('confID')
    ]
    if not top_per_conf:
        # Nothing to rank: return an empty frame with the same columns.
        return predictions_df.iloc[0:0].reset_index(drop=True)
    return pd.concat(top_per_conf).reset_index(drop=True)

def final_svm(dataset_train, dataset_predict):
    """Fit a tuned support vector classifier and print the predicted playoff teams.

    Features are standardized with statistics from ``dataset_train``. An SVC
    is tuned over ``C``, kernel and gamma with a 5-fold grid search on
    accuracy, refitted on all training rows, and used to score
    ``dataset_predict``. The four highest-scoring teams per conference are
    printed.

    Note that the ``Probability`` column of the printed table holds the raw
    ``decision_function`` output, not a calibrated probability; it is only
    used to rank the teams.

    Args:
        dataset_train: DataFrame holding the feature columns and the
            ``playoff`` target.
        dataset_predict: DataFrame of rows to score. It is scaled as a whole
            and must include ``tmID`` and ``confID`` columns.

    Returns:
        None. The selected teams are printed.
    """
    features = dataset_train.drop(columns=['playoff'])
    labels = dataset_train['playoff']

    # Learn the scaling on the training rows, then reuse it for prediction.
    standardizer = StandardScaler()
    features_scaled = standardizer.fit_transform(features)
    predict_scaled = standardizer.transform(dataset_predict)

    search = GridSearchCV(
        estimator=SVC(),
        param_grid={
            'C': [0.1, 1],
            'kernel': ['linear', 'rbf'],
            'gamma': ['auto', 'scale'],
        },
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
    )
    search.fit(features_scaled, labels)

    # Fresh estimator with the winning parameters, fitted on every training row.
    tuned_model = SVC(**search.best_params_)
    tuned_model.fit(features_scaled, labels)

    # Decision-function value per team; larger values favour the second
    # class in classes_.
    margin_scores = tuned_model.decision_function(predict_scaled)

    ranking = pd.DataFrame({
        'TeamID': dataset_predict['tmID'],
        'Probability': margin_scores,
        'confID': dataset_predict['confID'],
    })

    # Keep the best-ranked teams of each conference and show them.
    print(select_top_teams(ranking))

### K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
import pandas as pd

def select_top_teams(predictions_df, n=4):
    """Return the ``n`` highest-probability rows of each conference.

    Args:
        predictions_df: DataFrame with at least a ``confID`` column to group
            by and a ``Probability`` column to rank by.
        n: Number of rows to keep per ``confID`` group (default 4).

    Returns:
        DataFrame with the same columns as ``predictions_df``: groups in
        ascending ``confID`` order, rows within a group in descending
        ``Probability`` order, and a fresh 0-based index.
    """
    # Iterate over the groups instead of using groupby.apply: newer pandas
    # releases stop passing the grouping column to the applied function,
    # which would drop ``confID`` from the result.
    top_per_conf = [
        conf_rows.nlargest(n, 'Probability')
        for _, conf_rows in predictions_df.groupby('confID')
    ]
    if not top_per_conf:
        # Nothing to rank: return an empty frame with the same columns.
        return predictions_df.iloc[0:0].reset_index(drop=True)
    return pd.concat(top_per_conf).reset_index(drop=True)

def final_knn(dataset_train, dataset_predict):
    """Fit a tuned k-nearest-neighbours classifier and print the predicted playoff teams.

    Features are standardized with statistics from ``dataset_train`` and
    passed through a ``SelectKBest`` step configured with ``k='all'``. A KNN
    classifier is tuned with a 5-fold grid search on accuracy, refitted on all
    training rows, and used to score ``dataset_predict``. The four
    highest-scoring teams per conference are printed.

    Args:
        dataset_train: DataFrame holding the feature columns and the
            ``playoff`` target.
        dataset_predict: DataFrame of rows to score. It is scaled as a whole
            and must include ``tmID`` and ``confID`` columns.

    Returns:
        None. The retained feature names and the selected teams are printed.
    """
    train_features = dataset_train.drop(columns=['playoff'])
    train_labels = dataset_train['playoff']

    # Learn the scaling on the training rows, then reuse it for prediction.
    standardizer = StandardScaler()
    train_scaled = standardizer.fit_transform(train_features)
    predict_scaled = standardizer.transform(dataset_predict)

    # Univariate selection fitted on the training rows and applied to both sets.
    selector = SelectKBest(f_classif, k='all')
    train_selected = selector.fit_transform(train_scaled, train_labels)
    predict_selected = selector.transform(predict_scaled)

    print("Selected Features:", train_features.columns[selector.get_support()])

    search = GridSearchCV(
        estimator=KNeighborsClassifier(),
        param_grid={
            'n_neighbors': [1, 3, 5, 7, 10],
            'metric': ['euclidean', 'manhattan', 'minkowski'],
            'weights': ['uniform', 'distance'],
            'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
            'p': [1, 2],
        },
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
    )
    search.fit(train_selected, train_labels)

    # Fresh estimator with the winning parameters, fitted on every training row.
    tuned_model = KNeighborsClassifier(**search.best_params_)
    tuned_model.fit(train_selected, train_labels)

    # Column 1 of predict_proba is the second class in classes_
    # (presumably 'Y' — confirm against the label encoding).
    playoff_scores = tuned_model.predict_proba(predict_selected)[:, 1]

    ranking = pd.DataFrame({
        'TeamID': dataset_predict['tmID'],
        'Probability': playoff_scores,
        'confID': dataset_predict['confID'],
    })

    # Keep the best-ranked teams of each conference and show them.
    print(select_top_teams(ranking))

### Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
import pandas as pd

def select_top_teams(predictions_df, n=4):
    """Return the ``n`` highest-probability rows of each conference.

    Args:
        predictions_df: DataFrame with at least a ``confID`` column to group
            by and a ``Probability`` column to rank by.
        n: Number of rows to keep per ``confID`` group (default 4).

    Returns:
        DataFrame with the same columns as ``predictions_df``: groups in
        ascending ``confID`` order, rows within a group in descending
        ``Probability`` order, and a fresh 0-based index.
    """
    # Iterate over the groups instead of using groupby.apply: newer pandas
    # releases stop passing the grouping column to the applied function,
    # which would drop ``confID`` from the result.
    top_per_conf = [
        conf_rows.nlargest(n, 'Probability')
        for _, conf_rows in predictions_df.groupby('confID')
    ]
    if not top_per_conf:
        # Nothing to rank: return an empty frame with the same columns.
        return predictions_df.iloc[0:0].reset_index(drop=True)
    return pd.concat(top_per_conf).reset_index(drop=True)

def final_gradient_boosting(dataset_train, dataset_predict):
    """Fit a tuned gradient boosting classifier and print the predicted playoff teams.

    The classifier is tuned with a 5-fold grid search on accuracy, refitted on
    all of ``dataset_train``, and used to score every row of
    ``dataset_predict``. The four highest-scoring teams per conference are
    printed.

    Args:
        dataset_train: DataFrame holding the feature columns and the
            ``playoff`` target.
        dataset_predict: DataFrame of rows to score. It is passed to the model
            unchanged and must include ``tmID`` and ``confID`` columns.

    Returns:
        None. The selected teams are printed.
    """
    features = dataset_train.drop(columns=['playoff'])
    labels = dataset_train['playoff']

    search = GridSearchCV(
        estimator=GradientBoostingClassifier(),
        param_grid={
            'n_estimators': [50, 100, 200],
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 4, 5],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
            'subsample': [0.8, 1.0],
        },
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
    )
    search.fit(features, labels)

    # Fresh estimator with the winning parameters, fitted on every training row.
    tuned_model = GradientBoostingClassifier(**search.best_params_)
    tuned_model.fit(features, labels)

    # Column 1 of predict_proba is the second class in classes_
    # (presumably 'Y' — confirm against the label encoding).
    playoff_scores = tuned_model.predict_proba(dataset_predict)[:, 1]

    ranking = pd.DataFrame({
        'TeamID': dataset_predict['tmID'],
        'Probability': playoff_scores,
        'confID': dataset_predict['confID'],
    })

    # Keep the best-ranked teams of each conference and show them.
    print(select_top_teams(ranking))

### Select Features

In [None]:
# Build the final training and prediction frames, then run the decision tree.

# Feature list = every column of feature_engineering_result2 except the target.
selected_features = feature_engineering_result2.columns.to_list()
selected_features.remove('playoff')
print(selected_features)

# Engineered features from disk; its own 'playoff' and 'powerRanking2' columns
# are dropped before merging.
feature_engineering_dataset = pd.read_csv('filtered/feature_engineering_dataset.csv', delimiter=",")
feature_engineering_dataset.drop(columns=['playoff', 'powerRanking2'], inplace=True)

# Join the team table with the engineered features on (tmID, year).
testing_data = pd.read_csv('filtered/team2_before_shift.csv', delimiter=",")
#testing_data.to_csv('filtered/testing_data1.csv', index=False)
testing_data = pd.merge(testing_data, feature_engineering_dataset, on=['tmID', 'year'])
# The same encoder object is refitted for each column, so it only holds the
# confID mapping afterwards.
testing_data['tmID'] = label_encoder.fit_transform(testing_data['tmID'])
testing_data['confID'] = label_encoder.fit_transform(testing_data['confID'])
# Snapshot taken before 'playoff' is dropped and columns/rows are filtered, so
# training_data keeps the target and every merged column.
training_data = testing_data.copy()
testing_data.drop(columns=['playoff'], inplace=True)
testing_data = testing_data[selected_features]
# Only the year-10 rows are scored.
testing_data = testing_data[testing_data['year'] == 10]

testing_data.to_csv('filtered/testing_data.csv', index=False)

# shift_target_variable is defined elsewhere in the notebook.
# NOTE(review): testing_data was restricted to selected_features above, but
# training_data was not; unless shift_target_variable does that, the two frames
# will not share the same feature columns — confirm.
training_data = shift_target_variable(training_data)

training_data.to_csv('filtered/training_data.csv', index=False)

final_decision_tree(training_data, testing_data)

### Final Evaluation

In [None]:
final_random_forest(training_data, testing_data)

In [None]:
final_logistic_regression(training_data, testing_data)

In [None]:
final_svm(training_data, testing_data)

In [None]:
final_gradient_boosting(training_data, testing_data)

In [None]:
final_knn(training_data, testing_data)