# Prediction of teams that will reach the Playoffs

## Data Import

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime
! pip install tabulate
from tabulate import tabulate
from sklearn.preprocessing import LabelEncoder
from scipy.stats import chi2_contingency

# Load each CSV file of the basketballPlayoffs folder into its own DataFrame.
players = pd.read_csv('basketballPlayoffs/players.csv', delimiter=",")
coaches = pd.read_csv('basketballPlayoffs/coaches.csv', delimiter=",")
teams = pd.read_csv('basketballPlayoffs/teams.csv', delimiter=",")
players_teams = pd.read_csv('basketballPlayoffs/players_teams.csv', delimiter=",")
teams_post = pd.read_csv('basketballPlayoffs/teams_post.csv', delimiter=",")
series_post = pd.read_csv('basketballPlayoffs/series_post.csv', delimiter=",")
awards_players = pd.read_csv('basketballPlayoffs/awards_players.csv', delimiter=",")
awards_coaches = pd.read_csv('basketballPlayoffs/awards_coaches.csv', delimiter=",")

# Quick look at what was loaded; only `players` is limited to its first rows,
# the other frames are printed with pandas' default truncation.
print(players.head())
print(coaches)
print(teams)
print(players_teams)
print(teams_post)
print(series_post)
print(awards_players)
print(awards_coaches)

## Data cleaning

Check for missing values, outliers, and inconsistencies in the data. Clean and preprocess the data to ensure it's ready for analysis.

### Teams dataset

In [None]:
print(teams.isna().sum())
print(teams.head())

### Players Teams dataset

In [None]:
print(players_teams.isna().sum())
print(players_teams.head())

### Players dataset

In [None]:
print(players.isna().sum())
print(players.head())

### Coaches dataset

In [None]:
print(coaches.isna().sum())
print(coaches.head())

### Awards Players dataset

In [None]:
print(awards_players.isna().sum())
print(awards_players.head())

### Awards Coaches dataset

In [None]:
print(awards_coaches.isna().sum())
print(awards_coaches.head())

## Exploratory Data Analysis (EDA)

Conduct EDA to gain insights into the data. Visualize distributions, correlations, and patterns. This step will help you understand the relationships between different features and the target variable.

### Teams metrics

In [None]:
# A notebook cell only auto-displays its LAST expression, so the preview has
# to be printed explicitly or it is silently discarded.
print(teams.head())

# Missing values per column (shown as the cell output).
teams.isnull().sum()

In [None]:
# Print, for every column of `teams`, how many distinct values it contains.
# Series.unique() keeps NaN, so a missing value counts as one extra value here.
for col_name in teams.columns:
    distinct_total = len(teams[col_name].unique())
    print(f"Number of different values in the {col_name} column are:", distinct_total)
    print("------------")

In [None]:
# Number of distinct (non-NaN) values per column, in column order.
unique_per_column = teams.nunique()
columns = unique_per_column.index.tolist()
value_counts = unique_per_column.tolist()

# Horizontal bar chart: one bar per column of `teams`.
plt.figure(figsize=(10, 10))
plt.barh(columns, value_counts, color='skyblue')
plt.title('Number of Unique Values in Each Column')
plt.xlabel('Number of Unique Values')
plt.ylabel('Columns')
plt.show()

In [None]:
print(teams.dtypes)

In [None]:
# Work on a copy so the raw `teams` frame keeps its text columns.
teams_numeric = teams.copy()

# Replace every object-typed column by its integer category codes.
text_columns = [name for name in teams_numeric.columns if teams_numeric[name].dtype == 'object']
for name in text_columns:
    as_category = teams_numeric[name].astype('category')
    teams_numeric[name] = as_category.cat.codes

teams_numeric.describe()

In [None]:
def correlation_matrix(dataframe, target='playoff'):
    """Plot the correlation heatmap of ``dataframe`` and report correlations with ``target``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data whose pairwise correlations are computed with ``DataFrame.corr()``.
    target : str, default 'playoff'
        Column whose correlation with every column is printed and returned.

    Returns
    -------
    dict
        Maps each column name of the correlation matrix to its correlation
        with ``target``.

    Raises
    ------
    KeyError
        If ``target`` is not a column of the correlation matrix.
    """
    corr_matrix = dataframe.corr()

    # Looked up before plotting so a missing target fails fast.
    target_correlation = corr_matrix[target]

    plt.figure(figsize=(30, 20))

    # Mask the upper triangle (diagonal included) so each pair is drawn once.
    mask = np.triu(np.ones_like(corr_matrix, dtype=bool))

    sns.heatmap(corr_matrix, mask=mask, annot=True, annot_kws={"size": 8}, cmap='coolwarm', linewidths=0.5, fmt=".2f")

    plt.title('Correlation Matrix', fontsize=16)
    plt.show()

    # Named `correlations` rather than `dict` to avoid shadowing the builtin.
    correlations = {}

    for feature, correlation in target_correlation.items():
        print(f"Correlation between target and {feature}: {correlation}")
        correlations[feature] = correlation

    return correlations

In [None]:
correlation_matrix(teams_numeric)

In [None]:
def chi_square(dataset, target):
    """Run a chi-square independence test between ``target`` and every other column.

    For each column of ``dataset`` other than ``target`` a contingency table is
    built with ``pd.crosstab`` and passed to ``scipy.stats.chi2_contingency``.
    The statistic and the p-value are printed; nothing is returned.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Data containing the features and the target column.
    target : str
        Name of the target column.
    """
    for feature in dataset.columns:
        if feature == target:
            continue

        contingency_table = pd.crosstab(dataset[feature], dataset[target])

        # An empty table (no rows or no columns) cannot be tested.
        if contingency_table.empty:
            print(f"No data for {feature} and {target}")
            continue

        # chi2_contingency returns (statistic, p-value, degrees of freedom,
        # expected frequencies); only the first two are reported.
        chi2, p, _dof, _expected = chi2_contingency(contingency_table)

        print(f"Chi-square test for {feature} and {target}:")
        print(f"Chi-square value: {chi2}")
        print(f"P-value: {p}")
        print("")

In [None]:
chi_square(teams, 'playoff')

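The p-value is the probability of observing a relationship at least as extreme as the one in our sample data, assuming that there is no actual relationship in the population. A small p-value (for example, below 0.05) therefore suggests that the feature and the target are not independent.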

### Players metrics

In [None]:
print(players['playerID'].nunique()) 

print(players.head())

players.isnull().sum()

#### Erased columns and why

In [None]:
# Dropped columns: 'collegeOther', 'deathDate', 'firstseason', 'lastseason'
# The counts below show how many rows hold a real value in each of them.

# Count the rows where 'firstseason' is not equal to 0
non_zero_firstseason_count = len(players[players['firstseason'] != 0])

# Count the rows where 'lastseason' is not equal to 0
non_zero_lastseason_count = len(players[players['lastseason'] != 0])

# Count the rows where 'deathDate' is not equal to "0000-00-00"
# (whitespace is stripped first so padded values compare equal)
players['deathDate'] = players['deathDate'].str.strip()
non_empty_deathDate_count = len(players[players['deathDate'] != "0000-00-00"])

# Count the rows where 'collegeOther' is not missing (NaN)
non_nan_collegeOther_count = players['collegeOther'].notna().sum()

print("Number of rows with 'firstseason' different from 0:", non_zero_firstseason_count)
print("Number of rows with 'lastseason' different from 0:", non_zero_lastseason_count)
# The original literal `"... from "":"` concatenated to "... from :"; the
# message now states what is actually counted (non-NaN values).
print("Number of rows with 'collegeOther' different from NaN:", non_nan_collegeOther_count)
print("Number of rows with 'deathDate' different from '0000-00-00':", non_empty_deathDate_count)

#### Players heights comparison

In [None]:
# Heights are multiplied by 2.54 to obtain centimeters (1 inch = 2.54 cm)
INCH_TO_CM = 2.54
players['height_cm'] = players['height'].mul(INCH_TO_CM)

# Category labels, one per (lower, upper) range below
height_categories = ['< 160.0 cm', '160.0 - 170.0 cm', '170.0 - 180.0 cm', '180.0 - 190.0 cm', '190.0 - 200.0 cm', '> 200.0 cm']
height_ranges = [(0, 160.0), (160.0, 170.0), (170.0, 180.0), (180.0, 190.0), (190.0, 200.0), (200.0, float('inf'))]

# Bin edges: the lower bound of every range, closed by +inf
bin_edges = [lower for lower, _ in height_ranges]
bin_edges.append(float('inf'))

# Assign every player to a height category
players['height_category'] = pd.cut(players['height_cm'], bins=bin_edges, labels=height_categories)

# Players per category, in label order (categories without players count as 0)
raw_counts = players['height_category'].value_counts()
height_category_counts = raw_counts.reindex(height_categories, fill_value=0)

# Bar plot of the counts
plt.figure(figsize=(10, 6))
ax = sns.barplot(x=height_category_counts.index, y=height_category_counts.values, palette='Set2')

# Write the count above each bar
for bar in ax.patches:
    bar_top = bar.get_height()
    bar_center = bar.get_x() + bar.get_width() / 2.
    ax.annotate(f'{int(bar_top)}', (bar_center, bar_top), ha='center', va='center', fontsize=10, color='black', xytext=(0, 5), textcoords='offset points')

plt.title('Player Count in Height Categories')
plt.xlabel('Height Category')
plt.ylabel('Count')

plt.show()

#### Number of players in each position

In [None]:
# Count plot of player positions, most frequent position first
plt.figure(figsize=(10, 6))  # Adjust the figure size as needed
ax = sns.countplot(data=players, x='pos', order=players['pos'].value_counts().index, palette='Set2')

# Label each bar with its count. int() keeps the labels as whole numbers
# (a bare get_height() can render as e.g. "12.0"), matching the height chart.
for p in ax.patches:
    ax.annotate(f'{int(p.get_height())}', (p.get_x() + p.get_width() / 2., p.get_height()), ha='center', va='center', fontsize=10, color='black', xytext=(0, 5), textcoords='offset points')

# Set plot title and labels
plt.title('Number of Players in Each Position')
plt.xlabel('Position')
plt.ylabel('Count')

# Rotate x-axis labels for better readability (optional)
plt.xticks(rotation=45)

# Rows without a position are not part of the plot, so report them separately
empty_pos_count = players['pos'].isnull().sum()
print("Number of rows with empty 'pos':", empty_pos_count)

# Show the plot
plt.show()

#### Top 10 Colleges

In [None]:
# The 10 colleges with the most players (value_counts sorts descending)
top_10_colleges = players['college'].value_counts().iloc[:10]

# Count plot restricted to those colleges through `order`
plt.figure(figsize=(12, 6))  # Adjust the figure size as needed
ax = sns.countplot(data=players, x='college', order=top_10_colleges.index, palette='Set2')

# Label each bar with its count. int() keeps the labels as whole numbers
# (a bare get_height() can render as e.g. "12.0"), matching the height chart.
for p in ax.patches:
    ax.annotate(f'{int(p.get_height())}', (p.get_x() + p.get_width() / 2., p.get_height()), ha='center', va='center', fontsize=10, color='black', xytext=(0, 5), textcoords='offset points')

# Set plot title and labels
plt.title('Top 10 Colleges with the Most Players')
plt.xlabel('College')
plt.ylabel('Count')

# Rotate x-axis labels for better readability (optional)
plt.xticks(rotation=90)

# Show the plot
plt.show()

#### Correlation Matrix between numeric columns

In [None]:
# Numeric player columns to correlate with each other
numeric_columns = ["firstseason", "lastseason", "height", "weight"]
subset = players[numeric_columns]

# Pairwise correlations between those columns
corr_matrix = subset.corr()

# Annotated heatmap of the correlations
plt.figure(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap="coolwarm",
    fmt=".2f",
    linewidths=0.5,
)
plt.title('Correlation Matrix')
plt.show()

### Teams Post metrics

In [None]:
print(teams_post.head())

teams_post.isnull().sum()

In [None]:
# Per-row share of post-season games won, kept as a column on teams_post
teams_post['Win-Loss Ratio'] = teams_post['W'] / (teams_post['W'] + teams_post['L'])

# A tmID can appear in several rows (presumably one per season, TODO confirm).
# barh() would draw those rows at the same position, on top of each other,
# so wins and losses are summed per team before computing the ratio.
team_totals = teams_post.groupby('tmID')[['W', 'L']].sum()
team_ratio = team_totals['W'] / (team_totals['W'] + team_totals['L'])

# Create a horizontal bar chart with one bar per team
plt.figure(figsize=(12, 8))
plt.barh(team_ratio.index, team_ratio.values, color='skyblue')

# Add labels and title
plt.xlabel('Win-Loss Ratio')
plt.ylabel('Team ID (tmID)')
plt.title('Win-Loss Ratios for Teams on  Post-Season (based on tmID)')

# Show the chart
plt.show()

### Series Post metrics

In [None]:
# Number of series won by each team in each year (rows: year, columns: team)
team_wins_by_year = series_post.groupby(['year', 'tmIDWinner'])['W'].count().unstack(fill_value=0)

# DataFrame.plot creates its own figure, so the size is passed through
# `figsize`; a preceding plt.figure() would only leave an empty extra figure.
ax = team_wins_by_year.plot(kind='bar', stacked=True, colormap='Set3', figsize=(12, 8))
ax.set_xlabel('Year')
ax.set_ylabel('Number of Wins')
ax.set_title('Teams That Won In The Playoffs Each Year')

# Legend placed outside the axes so it does not cover the bars
ax.legend(title='Team', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

#### Teams that won and lost each year

In [None]:
# Keep only the series whose round is "F"
is_final = series_post['round'] == 'F'
finals_data = series_post.loc[is_final]

# Winner and loser of each of those series, per year
finals_results = finals_data.loc[:, ['year', 'tmIDWinner', 'tmIDLoser']]

# Render the frame as a grid table (index column hidden) and print it
table = tabulate(
    finals_results,
    headers='keys',
    tablefmt='fancy_grid',
    showindex=False,
)
print(table)

In [None]:
# Keep only the series whose round is "F"
finals_data = series_post.loc[series_post['round'] == 'F']

# Stack winners and losers into one Series and count each team's occurrences
finalists = pd.concat([finals_data['tmIDWinner'], finals_data['tmIDLoser']])
team_appearances = finalists.value_counts()

# Horizontal bar chart of the counts
plt.figure(figsize=(12, 6))
team_appearances.plot(kind='barh', color='skyblue')

plt.title('Total Appearances of Each Team in the Finals')
plt.xlabel('Number of Finals Appearances')
plt.ylabel('Team (tmID)')

plt.show()

### Coaches metrics

In [None]:
print(coaches['coachID'].nunique()) 

print(coaches.head())

coaches.isnull().sum()

In [None]:
import matplotlib.pyplot as plt

# Wins and losses come straight from the "won" and "lost" columns
coach_wins, coach_losses = coaches['won'], coaches['lost']

# One semi-transparent point per row of `coaches`
plt.scatter(coach_wins, coach_losses, alpha=0.5)
plt.title('Scatter Plot of Coach Wins vs. Losses')
plt.xlabel('Wins')
plt.ylabel('Losses')
plt.show()

## Feature Selection

Identify and select relevant features for your prediction model. Use techniques such as correlation analysis, recursive feature elimination, or feature importance from tree-based models.

### Teams dataset

In [None]:
original_teams = teams.copy()

In [None]:
# Missing values per column, printed before the columns are removed
print(teams.isna().sum())

# Every column removed by the feature selection, grouped as in the original
# three drop() calls.
columns_to_drop = [
    'lgID', 'divID', 'seeded', 'tmORB', 'tmDRB', 'tmTRB', 'opptmORB', 'opptmDRB', 'opptmTRB',
    'rank', 'firstRound', 'semis', 'finals',
    'min', 'o_oreb', 'o_dreb', 'd_oreb', 'd_dreb', 'name', 'franchID',
]
feature_selection_result = teams.drop(columns=columns_to_drop)

feature_selection_result.head()

In [None]:
feature_selection_result.to_csv('filtered/feature_selection_dataset.csv', index=False)

## Feature Engineering

Create new features that might enhance the predictive power of your model. This could involve transforming existing features, creating interaction terms, or incorporating external data.

The feature engineering is done inside the `players.ipynb` file, and the creation of the new dataset is done inside the `create_final_team.csv` file.

In [None]:
# TODO: check whether this can stay as it is or whether we should add the code here
# Load the feature-engineered dataset from disk.
feature_engineering_result = pd.read_csv('filtered/feature_engineering_dataset.csv', delimiter=",")

### Shifting target variable

In [None]:
def shift_target_variable(dataset):
    """Replace each row's 'playoff' value by the one of the team's next row.

    Rows are ordered by 'tmID' and 'year'; within each 'tmID' the 'playoff'
    column is shifted up by one row. Rows left without a value (the last row
    of every team) are removed and the index is rebuilt.

    The frame is modified IN PLACE and also returned; 'playoff' ends up as the
    last column.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the columns 'tmID', 'year' and 'playoff'.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    dataset.sort_values(by=['tmID', 'year'], inplace=True)

    # Target of the following row of the same team (NaN for a team's last row).
    next_row_playoff = dataset.groupby('tmID')['playoff'].shift(-1)

    # Swap the current target for the shifted one; re-adding the column
    # places it at the end of the frame.
    del dataset['playoff']
    dataset['playoff'] = next_row_playoff

    dataset.dropna(subset=['playoff'], inplace=True)
    dataset.reset_index(drop=True, inplace=True)

    return dataset

#### Original dataset

In [None]:
original_teams = shift_target_variable(original_teams)

In [None]:
# Numeric copy of the shifted original dataset, used only for the correlations
original_teams_numeric = original_teams.copy()

# Object-typed columns become integer category codes
object_columns = [
    name for name in original_teams_numeric.columns
    if original_teams_numeric[name].dtype == 'object'
]
for name in object_columns:
    original_teams_numeric[name] = original_teams_numeric[name].astype('category').cat.codes

correlation_matrix(original_teams_numeric)

#### Feature Selection dataset

In [None]:
feature_selection_result = shift_target_variable(feature_selection_result)

In [None]:
# Numeric copy of the feature-selection dataset, used only for the correlations
feature_teams_numeric = feature_selection_result.copy()

# Object-typed columns become integer category codes
object_columns = [
    name for name in feature_teams_numeric.columns
    if feature_teams_numeric[name].dtype == 'object'
]
for name in object_columns:
    feature_teams_numeric[name] = feature_teams_numeric[name].astype('category').cat.codes

correlation_matrix(feature_teams_numeric)

#### Feature engineering dataset

In [None]:
eng1_teams_numeric = shift_target_variable(feature_engineering_result)

In [None]:
# Detach from the frame returned by shift_target_variable before encoding
eng1_teams_numeric = eng1_teams_numeric.copy()

# Object-typed columns become integer category codes
object_columns = [
    name for name in eng1_teams_numeric.columns
    if eng1_teams_numeric[name].dtype == 'object'
]
for name in object_columns:
    eng1_teams_numeric[name] = eng1_teams_numeric[name].astype('category').cat.codes

correlation_matrix(eng1_teams_numeric)

In [None]:
# Load the second engineered dataset and drop its own 'playoff' column so the
# merge below keeps only the 'playoff' of feature_engineering_result.
feature_engineering_result2 = pd.read_csv('filtered/team2.csv', delimiter=",")
feature_engineering_result2.drop(columns=['playoff'], inplace=True)

# Inner join (pandas default) on team and year.
# NOTE(review): feature_engineering_result was already modified in place by
# shift_target_variable in an earlier cell, so its 'playoff' is the shifted
# target - confirm this is the intended input for the merge.
feature_engineering_result2 = pd.merge(feature_engineering_result, feature_engineering_result2, on=['tmID', 'year'])

# NOTE(review): 'text.csv' is written to the working directory and is not read
# anywhere in the visible code - looks like a debugging artifact; confirm.
feature_engineering_result2.to_csv('text.csv', index=False)

feature_engineering_result2.head()

In [None]:
# Numeric copy of the merged engineered dataset, used only for the correlations
eng2_teams_numeric = feature_engineering_result2.copy()

# Object-typed columns become integer category codes
object_columns = [
    name for name in eng2_teams_numeric.columns
    if eng2_teams_numeric[name].dtype == 'object'
]
for name in object_columns:
    eng2_teams_numeric[name] = eng2_teams_numeric[name].astype('category').cat.codes

correlation_matrix(eng2_teams_numeric)

In [None]:
continuous = ['year', 'average_powerRanking', 'average_PER', 'average_postPowerRanking', 'average_postPER']

In [None]:
eng2_teams_numeric.head()

## Classification

### Encoding

#### Original

In [None]:
# Columns of original_teams that are label-encoded, in encoding order
original_columns_to_encode = [
    'tmID', 'confID', 'arena',
    'name', 'franchID', 'lgID', 'divID',
    'firstRound', 'semis', 'finals',
]

# One shared encoder, refitted for every column; after the loop it only
# remembers the classes of the last column.
label_encoder = LabelEncoder()

for column_name in original_columns_to_encode:
    original_teams[column_name] = label_encoder.fit_transform(original_teams[column_name])

#### After Feature Selection

In [None]:
# Label-encode the remaining text columns, refitting the shared encoder each time
for column_name in ('tmID', 'confID', 'arena'):
    feature_selection_result[column_name] = label_encoder.fit_transform(feature_selection_result[column_name])

#### After feature engineering

In [None]:
# Apply label encoding to the 'tmID' column (the shared encoder is refitted)
feature_engineering_result['tmID'] = label_encoder.fit_transform(feature_engineering_result['tmID'])

In [None]:
# Label-encode the text columns of the merged dataset (shared encoder, refitted per column)
feature_engineering_result2['tmID'] = label_encoder.fit_transform(feature_engineering_result2['tmID'])
feature_engineering_result2['confID'] = label_encoder.fit_transform(feature_engineering_result2['confID'])
feature_engineering_result2['arena'] = label_encoder.fit_transform(feature_engineering_result2['arena'])

# Disabled encoding of the playoff-round columns. Kept as real comments: a
# triple-quoted string is an expression and, being the last one in the cell,
# was displayed as the cell's output.
# feature_engineering_result2['firstRound'] = label_encoder.fit_transform(feature_engineering_result2['firstRound'])
# feature_engineering_result2['semis'] = label_encoder.fit_transform(feature_engineering_result2['semis'])
# feature_engineering_result2['finals'] = label_encoder.fit_transform(feature_engineering_result2['finals'])

### Models

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.svm import SVC

from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score, classification_report

#### Decision Tree

In [None]:
def decision_tree_model(dataset, year):
    """Fit a decision tree on the rows before ``year`` and evaluate it on ``year``.

    Rows with ``dataset['year'] < year`` form the training set and rows with
    ``dataset['year'] == year`` the test set; 'playoff' is the target and all
    other columns are features. The accuracy and the classification report
    are printed; nothing is returned.
    """
    train_rows = dataset[dataset['year'] < year]
    test_rows = dataset[dataset['year'] == year]

    X_train, y_train = train_rows.drop(columns=['playoff']), train_rows['playoff']
    X_test, y_test = test_rows.drop(columns=['playoff']), test_rows['playoff']

    # Fixed seed so repeated runs build the same tree.
    tree = DecisionTreeClassifier(random_state=42)
    tree.fit(X_train, y_train)

    predictions = tree.predict(X_test)

    accuracy = accuracy_score(y_test, predictions)
    print(f"Accuracy: {accuracy:.2f}")

    print(classification_report(y_test, predictions))

##### Original dataset

In [None]:
decision_tree_model(original_teams, 9)

##### Feature Selection dataset

In [None]:
decision_tree_model(feature_selection_result, 9)

##### Feature engineering dataset

In [None]:
decision_tree_model(feature_engineering_result, 9)

In [None]:
decision_tree_model(feature_engineering_result2, 9)

#### Random Forest

In [None]:
def random_forest_model(dataset, year):
    """Fit a random forest on the rows before ``year`` and evaluate it on ``year``.

    Rows with ``dataset['year'] < year`` form the training set and rows with
    ``dataset['year'] == year`` the test set; 'playoff' is the target and all
    other columns are features. The accuracy and the classification report
    are printed; nothing is returned.
    """
    train_rows = dataset[dataset['year'] < year]
    test_rows = dataset[dataset['year'] == year]

    X_train, y_train = train_rows.drop(columns=['playoff']), train_rows['playoff']
    X_test, y_test = test_rows.drop(columns=['playoff']), test_rows['playoff']

    # Fixed seed so repeated runs build the same forest.
    forest = RandomForestClassifier(random_state=42)
    forest.fit(X_train, y_train)

    predictions = forest.predict(X_test)

    accuracy_rf = accuracy_score(y_test, predictions)
    print(f"Random Forest Accuracy: {accuracy_rf:.2f}")

    print(classification_report(y_test, predictions))

##### Original dataset

In [None]:
random_forest_model(original_teams, 9)

##### Feature Selection dataset

In [None]:
random_forest_model(feature_selection_result, 9)

##### Feature engineering dataset

In [None]:
random_forest_model(feature_engineering_result, 9)

In [None]:
random_forest_model(feature_engineering_result2, 9)

#### Logistic Regression

In [None]:
def logistic_regression_model(dataset, year):
    """Fit a logistic regression on the rows before ``year`` and evaluate it on ``year``.

    Rows with ``dataset['year'] < year`` form the training set and rows with
    ``dataset['year'] == year`` the test set; 'playoff' is the target and all
    other columns are features. Features are standardized with statistics
    learned on the training rows only, as ``svm_model`` does. The accuracy and
    the classification report are printed; nothing is returned.
    """
    train_rows = dataset[dataset['year'] < year]
    test_rows = dataset[dataset['year'] == year]

    X_train, y_train = train_rows.drop(columns=['playoff']), train_rows['playoff']
    X_test, y_test = test_rows.drop(columns=['playoff']), test_rows['playoff']

    # The solver is sensitive to feature scale: without standardization it
    # tends to stop at the iteration limit before converging.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # max_iter raised from the default as an additional convergence margin.
    logreg_model = LogisticRegression(random_state=42, max_iter=1000)

    logreg_model.fit(X_train_scaled, y_train)

    y_pred_logreg = logreg_model.predict(X_test_scaled)

    accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
    print(f"Logistic Regression Accuracy: {accuracy_logreg:.2f}")

    print(classification_report(y_test, y_pred_logreg))

##### Original dataset

In [None]:
logistic_regression_model(original_teams, 9)

##### Feature Selection dataset

In [None]:
logistic_regression_model(feature_selection_result, 9)

##### Feature Engineering dataset

In [None]:
logistic_regression_model(feature_engineering_result, 9)

In [None]:
logistic_regression_model(feature_engineering_result2, 9)

#### Support Vector Machines (SVM)

In [None]:
def svm_model(dataset, year):
    """Fit an SVM on the rows before ``year`` and evaluate it on ``year``.

    Rows with ``dataset['year'] < year`` form the training set and rows with
    ``dataset['year'] == year`` the test set; 'playoff' is the target and all
    other columns are features. Features are standardized with statistics
    learned on the training rows only. The accuracy and the classification
    report are printed; nothing is returned.
    """
    train_rows = dataset[dataset['year'] < year]
    test_rows = dataset[dataset['year'] == year]

    X_train, y_train = train_rows.drop(columns=['playoff']), train_rows['playoff']
    X_test, y_test = test_rows.drop(columns=['playoff']), test_rows['playoff']

    # Fit the scaler on the training rows, then reuse it for the test rows.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    classifier = SVC(random_state=42)
    classifier.fit(X_train_scaled, y_train)

    predictions = classifier.predict(X_test_scaled)

    accuracy_svm = accuracy_score(y_test, predictions)
    print(f"Support Vector Machine Accuracy: {accuracy_svm:.2f}")

    print(classification_report(y_test, predictions))

##### Original dataset

In [None]:
svm_model(original_teams, 9)

##### Feature Selection dataset

In [None]:
svm_model(feature_selection_result, 9)

##### Feature engineering dataset

In [None]:
svm_model(feature_engineering_result, 9)

In [None]:
svm_model(feature_engineering_result2, 9)

#### K-Nearest Neighbors (KNN)

In [None]:
def knn_model(dataset, year, n_neighbors=3):
    """Fit a k-nearest-neighbors classifier on the rows before ``year`` and evaluate it on ``year``.

    Rows with ``dataset['year'] < year`` form the training set and rows with
    ``dataset['year'] == year`` the test set; 'playoff' is the target and all
    other columns are features. Features are standardized with statistics
    learned on the training rows only, as ``svm_model`` does. The accuracy and
    the classification report are printed; nothing is returned.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain the columns 'year' and 'playoff'.
    year : int
        Value of 'year' used as the test set.
    n_neighbors : int, default 3
        Number of neighbors passed to ``KNeighborsClassifier``.
    """
    # Separate the training and testing sets
    train_rows = dataset[dataset['year'] < year]
    test_rows = dataset[dataset['year'] == year]

    X_train, y_train = train_rows.drop(columns=['playoff']), train_rows['playoff']
    X_test, y_test = test_rows.drop(columns=['playoff']), test_rows['playoff']

    # KNN compares samples by distance, so unscaled features with large
    # values would dominate the neighbor search.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Create and train the KNN model (local name no longer shadows the function)
    classifier = KNeighborsClassifier(n_neighbors=n_neighbors)
    classifier.fit(X_train_scaled, y_train)

    # Make predictions on the test set
    y_pred_knn = classifier.predict(X_test_scaled)

    # Evaluate and print the model performance
    accuracy_knn = accuracy_score(y_test, y_pred_knn)
    print(f"KNN Accuracy: {accuracy_knn:.2f}")

    print(classification_report(y_test, y_pred_knn))

##### Original dataset

In [None]:
knn_model(original_teams, 9)

##### Feature Selection dataset

In [None]:
knn_model(feature_selection_result, 9)

##### Feature engineering dataset

In [None]:
knn_model(feature_engineering_result, 9)

In [None]:
knn_model(feature_engineering_result2, 9)

## Model Evaluation