In [79]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [80]:
!pip install vaderSentiment



In [120]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from transformers import pipeline

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

In [121]:
# Load the dataset
file_path = "/content/drive/MyDrive/Project3database.csv"
data = pd.read_csv(file_path)

In [None]:
data = columns

In [122]:
# Display basic information about the dataset
print(data.info())
print(data.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 720 entries, 0 to 719
Data columns (total 83 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Year      720 non-null    int64  
 1   Tm        720 non-null    object 
 2   #Bat      720 non-null    int64  
 3   BatAge    720 non-null    float64
 4   R/G       720 non-null    float64
 5   G         720 non-null    int64  
 6   PA        720 non-null    int64  
 7   AB        720 non-null    int64  
 8   RS        720 non-null    int64  
 9   H         720 non-null    int64  
 10  2B        720 non-null    int64  
 11  3B        720 non-null    int64  
 12  HR        720 non-null    int64  
 13  RBI       720 non-null    int64  
 14  SB        720 non-null    int64  
 15  CS        720 non-null    int64  
 16  BB        720 non-null    int64  
 17  SO        720 non-null    int64  
 18  BA        720 non-null    float64
 19  OBP       720 non-null    float64
 20  SLG       720 non-null    float6

In [123]:
# Data Exploration
# Check for missing values
print(data.isnull().sum())

Year          0
Tm            0
#Bat          0
BatAge        0
R/G           0
           ... 
Rdrs        390
Rdrs/yr      90
Rgood       124
Playoff       0
Champion      0
Length: 83, dtype: int64


In [124]:
# Statistical summary
print(data.describe())


              Year        #Bat      BatAge         R/G           G  \
count   720.000000  720.000000  720.000000  720.000000  720.000000   
mean   2011.500000   47.930556   28.660278    4.564194  157.708333   
std       6.926999    6.065535    1.307156    0.510573   20.417913   
min    2000.000000   34.000000   25.400000    3.170000   58.000000   
25%    2005.750000   44.000000   27.800000    4.220000  162.000000   
50%    2011.500000   47.000000   28.600000    4.540000  162.000000   
75%    2017.250000   51.000000   29.500000    4.900000  162.000000   
max    2023.000000   69.000000   33.500000    6.040000  163.000000   

                PA           AB          RS            H          2B  ...  \
count   720.000000   720.000000  720.000000   720.000000  720.000000  ...   
mean   6029.109722  5378.838889  719.450000  1384.966667  276.601389  ...   
std     804.285879   716.705082  122.748055   207.974680   47.398623  ...   
min    2011.000000  1752.000000  219.000000   390.000000   73

In [125]:
# Data Cleaning and Preprocessing
# Fill or drop missing values, if any
data = data.dropna()

In [126]:
# Feature Engineering
# Create relevant features for playoff prediction
data['Run_Difference'] = data['RS'] - data['RA']
# Add any other feature engineering steps here

In [127]:
# Select features and target variable
features = data[['Run_Difference', 'HR', 'ERA', 'W']]
target = data['Playoff']

In [128]:
features.shape

(326, 4)

In [129]:
target.shape

(326,)

In [130]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)


In [131]:
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [132]:
# Train a RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_scaled, y_train)

In [133]:
# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)
print("Random Forest Classifier Evaluation")
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Random Forest Classifier Evaluation
              precision    recall  f1-score   support

           0       1.00      0.93      0.96        44
           1       0.88      1.00      0.94        22

    accuracy                           0.95        66
   macro avg       0.94      0.97      0.95        66
weighted avg       0.96      0.95      0.96        66

[[41  3]
 [ 0 22]]


In [134]:
# Select the needed columns
selected_columns = data[['Run_Difference', 'HR', 'ERA', 'W']]

# Display the first few rows of the selected columns
selected_columns.head()

Unnamed: 0,Run_Difference,HR,ERA,W
0,-15,166,4.48,84
1,231,307,4.14,104
2,129,183,3.89,101
3,-4,182,4.52,78
4,96,196,4.08,83


In [147]:
# Ask the user for the year
year = int(input("Please enter the year: "))

Please enter the year: 2020


In [136]:
# Check data types and unique values in the 'Year' column
print("Data types in the dataset:")
print(data.dtypes)

print("\nUnique values in the 'Year' column:")
print(data['Year'].unique())

Data types in the dataset:
Year                int64
Tm                 object
#Bat                int64
BatAge            float64
R/G               float64
                   ...   
Rdrs/yr           float64
Rgood             float64
Playoff             int64
Champion            int64
Run_Difference      int64
Length: 84, dtype: object

Unique values in the 'Year' column:
[2023 2022 2021 2020 2019 2018 2017 2016 2015 2014 2013]


In [137]:
# Check if the year exists in the dataset
if year not in data['Year'].values:
    print(f"The year {year} is not in the dataset.")
else:
    # Filter the dataset for the desired year
    filtered_data = data[data['Year'] == year]

    # Display filtered data to debug
    print(f"Filtered data for the year {year}:")
    print(filtered_data.head())

Filtered data for the year 2019:
     Year                    Tm  #Bat  BatAge   R/G    G    PA    AB   RS  \
120  2019  Arizona Diamondbacks    45    28.7  5.02  162  6315  5633  813   
121  2019        Atlanta Braves    50    28.0  5.28  162  6302  5560  855   
122  2019     Baltimore Orioles    58    26.5  4.50  162  6189  5596  729   
123  2019        Boston Red Sox    47    27.3  5.56  162  6475  5770  901   
124  2019          Chicago Cubs    52    27.7  5.02  162  6195  5461  814   

        H  ...   DP   Fld%  Rtot  Rtot/yr  Rdrs  Rdrs/yr  Rgood  Playoff  \
120  1419  ...  136  0.986    52      5.0  91.0      3.0    9.0        0   
121  1432  ...  154  0.987     5      0.0  22.0      2.0   -2.0        1   
122  1379  ...  155  0.982     3      0.0 -53.0     -5.0   -9.0        0   
123  1554  ...  115  0.985   -15     -1.0  10.0      0.0   -8.0        0   
124  1378  ...  141  0.981    12      1.0  32.0      1.0    5.0        0   

     Champion  Run_Difference  
120         0  

In [138]:
# Filter the dataset for the desired year
filtered_data = data[data['Year'] == year]

In [139]:
# Select the relevant columns
relevant_columns = filtered_data[['Tm', 'Run_Difference', 'HR', 'ERA', 'W']]

In [140]:
# To predict playoff teams for a given year, create a DataFrame with the relevant features for that year
future_data = pd.DataFrame(relevant_columns)
future_data.head()


Unnamed: 0,Tm,Run_Difference,HR,ERA,W
120,Arizona Diamondbacks,70,220,4.25,85
121,Atlanta Braves,112,249,4.19,97
122,Baltimore Orioles,-252,213,5.59,54
123,Boston Red Sox,73,245,4.7,84
124,Chicago Cubs,97,256,4.1,84


In [141]:
# Standardize the features
features = future_data.drop(columns=["Tm"])  # Drop team names
scaler = StandardScaler().fit(features)  # Fit scaler on features
future_data_scaled = scaler.transform(features)  # Transform features

# Create a new DataFrame with the scaled data and the team names
scaled_df = pd.DataFrame(future_data_scaled, columns=features.columns)
scaled_df["Tm"] = future_data["Tm"].reset_index(drop=True)  # Add team names back
scaled_df

Unnamed: 0,Run_Difference,HR,ERA,W,Tm
0,0.454927,-0.143113,-0.431499,0.258015,Arizona Diamondbacks
1,0.727883,0.56432,-0.536317,1.025661,Atlanta Braves
2,-1.637736,-0.313873,1.909428,-1.725072,Baltimore Orioles
3,0.474424,0.466743,0.354633,0.194044,Boston Red Sox
4,0.630398,0.73508,-0.693544,0.194044,Chicago Cubs
5,-0.80587,-1.070094,0.704025,-0.573602,Chicago White Sox
6,-0.06499,0.027647,-0.553787,-0.381691,Cincinnati Reds
7,0.727883,-0.06993,-1.28751,0.769779,Cleveland Indians
8,-0.799371,-0.045536,1.85702,-0.637573,Colorado Rockies
9,-2.164151,-1.875104,1.297992,-2.172866,Detroit Tigers


In [142]:
# Standardize the features
#future_data_scaled = scaler.transform(future_data)


In [143]:
# Predict playoff teams
future_predictions = rf_model.predict(future_data_scaled)
print("Future Playoff Predictions")
print(future_predictions)

Future Playoff Predictions
[0 1 0 0 0 0 0 1 0 0 1 0 0 1 0 0 1 0 1 1 0 0 0 0 0 1 1 0 0 1]


In [144]:
# Create a new DataFrame with the scaled data and the team names
scaled_df = pd.DataFrame(future_data_scaled, columns=features.columns)
scaled_df["Tm"] = future_data["Tm"]  # Add team names back

In [145]:
# Interpret the predictions (assuming binary classification with 1 indicating playoff appearance)
playoff_teams = future_data[future_predictions == 1].reset_index(drop=True)
print("Teams predicted to be in playoffs:")
print (playoff_teams)

Teams predicted to be in playoffs:
                     Tm  Run_Difference   HR   ERA    W
0        Atlanta Braves             112  249  4.19   97
1     Cleveland Indians             112  223  3.76   93
2        Houston Astros             280  288  3.66  107
3   Los Angeles Dodgers             273  279  3.37  106
4       Minnesota Twins             185  307  4.18  101
5      New York Yankees             204  306  4.31  103
6     Oakland Athletics             165  257  3.97   97
7   St. Louis Cardinals             102  210  3.80   91
8        Tampa Bay Rays             113  217  3.65   96
9  Washington Nationals             149  231  4.27   93
