In [None]:
# Mount Google Drive at /content/drive so files stored there (such as the CSV
# path used later in this notebook) are readable from this runtime.
# NOTE(review): `google.colab` is presumably only importable inside Colab.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
!pip install nltk


In [None]:
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Download VADER lexicon (only need to do this once)
nltk.download('vader_lexicon')

# Initialize the sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Load the dataset
file_path = "/content/drive/MyDrive/Project3database.csv"
data = pd.read_csv(file_path)

# Ensure 'Year' and 'Tm' columns are present
if 'Year' not in data.columns or 'Tm' not in data.columns:
    raise ValueError("'Year' and 'Tm' columns are required in the dataset.")

# Select relevant features for prediction
features = ['R/G', 'G', 'PA', 'AB', 'RS', 'H', '2B', '3B', 'HR', 'RBI', 'SB', 'CS', 'BB', 'SO', 'BA', 'OBP', 'SLG', 'OPS']

# Keep only rows with a value for every feature. The explicit .copy() makes
# this an independent frame, so adding the target columns below never writes
# through to (or warns about) a view of `data`.
data_filtered = data.dropna(subset=features).copy()

# Generate random binary target data for demonstration purposes (since the
# real target data is cleared). The generator is seeded so every run draws the
# same placeholder labels, matching the fixed random_state used for the split
# and the models. Because the labels are random, any accuracy reported from
# them is only a smoke test of the pipeline, not a measure of model quality.
rng = np.random.default_rng(42)
data_filtered['Playoff'] = rng.integers(2, size=len(data_filtered))
data_filtered['Champion'] = rng.integers(2, size=len(data_filtered))

# Define the feature set and target variables
X = data_filtered[features]
y_playoff = data_filtered['Playoff']
y_champion = data_filtered['Champion']

# Standardize the features
# NOTE(review): the scaler is fitted on every row before the train/test split
# happens, so test-row statistics influence the scaling. Harmless with random
# placeholder labels; fit on the training rows only once real targets return.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# One shuffled 80/20 split shared by both targets. Passing both label series
# in a single call applies the same row selection to each of them, so
# X_train / X_test line up with the playoff and the champion labels alike.
(
    X_train, X_test,
    y_playoff_train, y_playoff_test,
    y_champion_train, y_champion_test,
) = train_test_split(X_scaled, y_playoff, y_champion, test_size=0.2, random_state=42)

# ---- Playoff prediction: fit, predict on the held-out rows, report ----
model_playoff = RandomForestClassifier(n_estimators=100, random_state=42)
model_playoff.fit(X_train, y_playoff_train)

y_playoff_pred = model_playoff.predict(X_test)
playoff_accuracy = accuracy_score(y_playoff_test, y_playoff_pred)
print("Playoff Prediction Accuracy:", playoff_accuracy)

print("Random Forest Classifier Evaluation for Playoff Prediction")
print(classification_report(y_playoff_test, y_playoff_pred))
print(confusion_matrix(y_playoff_test, y_playoff_pred))

# ---- Champion prediction: same features, champion labels ----
model_champion = RandomForestClassifier(n_estimators=100, random_state=42)
model_champion.fit(X_train, y_champion_train)

y_champion_pred = model_champion.predict(X_test)
champion_accuracy = accuracy_score(y_champion_test, y_champion_pred)
print("Champion Prediction Accuracy:", champion_accuracy)

print("Random Forest Classifier Evaluation for Champion Prediction")
print(classification_report(y_champion_test, y_champion_pred))
print(confusion_matrix(y_champion_test, y_champion_pred))

def _make_binary_classifier(n_features):
    """Build and compile a 64-32-1 dense network with a sigmoid output.

    Both neural-network models in this script share this architecture and
    these compile settings (Adam, binary cross-entropy, accuracy metric).
    """
    network = Sequential([
        Dense(64, activation='relu', input_shape=(n_features,)),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid'),
    ])
    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return network


# Playoff network: train for 50 epochs, holding back 20% of the training rows
# for validation, then score it on the test split.
model_playoff_nn = _make_binary_classifier(X_scaled.shape[1])
model_playoff_nn.fit(X_train, y_playoff_train, epochs=50, batch_size=32, validation_split=0.2)

# evaluate() returns [loss, accuracy] given the single metric compiled above.
playoff_nn_accuracy = model_playoff_nn.evaluate(X_test, y_playoff_test)
print("Playoff Neural Network Prediction Accuracy:", playoff_nn_accuracy[1])

# Champion network: identical setup, trained on the champion labels.
model_champion_nn = _make_binary_classifier(X_scaled.shape[1])
model_champion_nn.fit(X_train, y_champion_train, epochs=50, batch_size=32, validation_split=0.2)

champion_nn_accuracy = model_champion_nn.evaluate(X_test, y_champion_test)
print("Champion Neural Network Prediction Accuracy:", champion_nn_accuracy[1])

def extract_relevant_text(team_name, article):
    """Return the sentences of `article` that mention `team_name`.

    The article is cut into sentences at every '.', except that a period
    belonging to an occurrence of `team_name` itself is not a boundary. This
    keeps names such as "St. Louis Cardinals" intact; a plain split on '.'
    would tear them apart and the team would never be found.

    Args:
        team_name: Exact, case-sensitive text to look for.
        article: Text to search.

    Returns:
        The matching sentences joined with a single space, without their
        terminating periods and with any other whitespace left as-is.
        An empty string when no sentence mentions the team.
    """
    import re

    if not team_name:
        # An empty name is contained in every sentence, so every sentence is
        # "relevant"; handle it here because an empty regex group would match
        # at every position and hide the period boundaries.
        return ' '.join(article.split('.'))

    # Scan for either the team name (tried first, so its own periods are
    # consumed as part of the name) or a sentence-ending period.
    token = re.compile("(" + re.escape(team_name) + r")|\.")

    relevant_sentences = []
    sentence_start = 0
    mentions_team = False
    for match in token.finditer(article):
        if match.group(1) is not None:
            mentions_team = True
            continue
        # Reached a sentence boundary: keep the sentence if it named the team.
        if mentions_team:
            relevant_sentences.append(article[sentence_start:match.start()])
        sentence_start = match.end()
        mentions_team = False

    # Text after the last period forms a final (possibly unterminated) sentence.
    if mentions_team:
        relevant_sentences.append(article[sentence_start:])

    return ' '.join(relevant_sentences)

def analyze_sentiment(text, context):
    """Score `context` followed by `text` with the module-level `sia` analyzer.

    The two pieces are joined with a newline, context first, and the joined
    string is passed to `sia.polarity_scores`, whose result is returned
    unchanged.
    """
    return sia.polarity_scores(f"{context}\n{text}")

def _playoff_team_count(year):
    """Return the number of playoff teams to select for `year`.

    Covers 2000-2024. The 2020 season used an expanded 16-team field; 2021
    went back to the 10-team format before the 12-team format began in 2022.

    Raises:
        ValueError: If `year` is outside 2000-2024.
    """
    if 2000 <= year <= 2011:
        return 8
    if 2012 <= year <= 2019 or year == 2021:
        return 10
    if year == 2020:
        return 16
    if 2022 <= year <= 2024:
        return 12
    raise ValueError("Year is out of the expected range for this analysis")


def predict_for_year(year):
    """Print the predicted playoff field, champion and champion sentiment for `year`.

    Uses the module-level `data`, `X`, `scaler`, `model_playoff` and
    `model_champion`. Teams are ranked by the playoff model's positive-class
    probability and the top N (N depends on the season's playoff format) are
    kept; among those, the team with the highest champion-model probability
    is reported, together with a sentiment score for the sentences about it
    in a built-in descriptive text.

    Args:
        year: Season to predict, as an integer.

    Returns:
        None. All results are printed. Also returns early (after printing a
        message) when the year has no usable rows in the dataset.

    Raises:
        ValueError: If `year` is in the dataset but outside 2000-2024.
    """
    if year not in data['Year'].values:
        print(f"The year {year} is not in the dataset.")
        return

    # Fail fast on unsupported seasons, before any transform work is done.
    n_playoff_teams = _playoff_team_count(year)

    # Ensure the same columns, in the same order, are used as when the scaler
    # and the models were fitted.
    relevant_columns = list(X.columns)

    # The models were trained on rows without missing feature values, so drop
    # incomplete rows here too rather than feeding NaN into them.
    # .copy() avoids the SettingWithCopyWarning when columns are added below.
    filtered_data = data[data['Year'] == year].dropna(subset=relevant_columns).copy()
    if filtered_data.empty:
        print(f"No rows with complete feature values for the year {year}.")
        return

    # Standardize the features
    future_data_scaled = scaler.transform(filtered_data[relevant_columns])

    # Predict playoff teams (column 1 is the probability of the positive class)
    filtered_data['Playoff_Prob'] = model_playoff.predict_proba(future_data_scaled)[:, 1]

    # Select top N teams based on playoff probability
    playoff_teams = filtered_data.nlargest(n_playoff_teams, 'Playoff_Prob')
    print("Teams predicted to be in playoffs:")
    print(playoff_teams[['Tm', 'Playoff_Prob']])

    # Predict champions among playoff teams
    playoff_teams_scaled = scaler.transform(playoff_teams[relevant_columns])
    playoff_teams['Champion_Prob'] = model_champion.predict_proba(playoff_teams_scaled)[:, 1]

    # Select the champion (team with the highest probability)
    champion_team = playoff_teams.loc[playoff_teams['Champion_Prob'].idxmax()]
    print("Team predicted to be the champion:")
    print(champion_team[['Tm', 'Champion_Prob']])

    # Sample article text for sentiment context
    article_text = """
New York Yankees: Renowned for their winning history, including 27 World Series titles and a massive fan base.

Toronto Blue Jays: A beloved team with a strong Canadian following, known for their back-to-back World Series wins and appealing throwback designs.

Boston Red Sox: Celebrated for their loyal fan base and overcoming a long championship drought with three titles in the 21st century.

San Francisco Giants: A team with multiple World Series victories and a dedicated fan base, praised for their consistent performance.

St. Louis Cardinals: Known for their sportsmanship and a warm fan culture, they have a rich history in baseball.

Los Angeles Dodgers: A team with a powerful roster and a history of postseason appearances, though they have faced criticism for past controversies.

Chicago Cubs: Famous for their devoted fan base and the historic Wrigley Field, they have a long-standing tradition despite a championship drought.

Pittsburgh Pirates: Loved by local fans for their unpredictability, they evoke strong emotions and memories from loyal supporters.

Atlanta Braves: Known for their competitive spirit and recent success, they have a passionate fan base that remains hopeful for future championships.

New York Mets: Often seen as the underdogs compared to the Yankees, they have loyal fans who support them through ups and downs.

Baltimore Orioles: Appreciated for their charming ballpark and history, they have loyal fans despite recent struggles.

Oakland Athletics: Known for their small budget yet competitive spirit, they are a fun team with young talent and a dedicated fan base.

Philadelphia Phillies: Gaining respect for their potential, they have loyal fans who support them through challenges.

Seattle Mariners: Featuring a history of great players, they have dedicated fans who believe in their potential for future success.

Cleveland Guardians: Valued for their competitive spirit and young talent, they have fans who believe in their potential to improve.

Cleveland Indians:  Valued for their competitive spirit and young talent, they have fans who believe in their potential to improve.

Detroit Tigers: Recognized for their history and sportsmanship, they have dedicated fans who remember their past successes.

Los Angeles Angels: With star players like Mike Trout, they have loyal supporters who believe in their potential to make a playoff run.

Kansas City Royals: Known for their loyal fan base and recent success, they are celebrated for their championship win in 2015.

Minnesota Twins: With a strong history and a dedicated fan base, they are celebrated for their World Series wins.

Tampa Bay Rays: Known for their low budget but competitive spirit, they consistently challenge more affluent teams in the league.

Milwaukee Brewers: Recognized for their passionate fan base, they have a history of loyalty despite not winning a World Series.

Cincinnati Reds: Celebrated as the first professional baseball team, they have a storied history and a loyal fan base.

Chicago White Sox: Often overshadowed by the Cubs, they have a strong fan base and a history of success.

Texas Rangers: Known for their dedicated fans, they are appreciated for their community spirit and near-misses in the World Series.

Washington Nationals: A young team with potential, they have fans looking forward to future success.

Arizona Diamondbacks: Known for winning the World Series early in their history, they have a passionate local following.

San Diego Padres: A team with local pride, they have loyal fans despite ups and downs in performance.

Houston Astros: Recently successful, they have a strong roster and a fan base that supports them through controversies.

Colorado Rockies: A team with a beautiful ballpark, they evoke pride in local fans despite a mixed history.

Miami Marlins: Often viewed as an underdog, they have potential and young talent but struggle with fan engagement.

Florida Marlins: Often viewed as an underdog, they have potential and young talent but struggle with fan engagement.
"""

    # Extract relevant text for sentiment analysis
    # NOTE(review): this assumes the 'Tm' values are full team names as
    # spelled in the article; if they are abbreviations nothing will match
    # and only the team name itself gets scored -- confirm against the CSV.
    team_name = champion_team['Tm']
    relevant_text = extract_relevant_text(team_name, article_text)

    # Analyze sentiment: the article excerpt is the context, followed by the
    # team name itself.
    sentiment = analyze_sentiment(team_name, relevant_text)
    print(f"Sentiment for {team_name}: {sentiment}")

# Ask the user for a year and make predictions
try:
    # Keep only the parse inside this try: a ValueError here can only mean
    # the entry was not an integer.
    year_to_predict = int(input("Enter a year to predict playoffs and champion: "))
except ValueError:
    print("Please enter a valid year.")
else:
    try:
        predict_for_year(year_to_predict)
    except ValueError as err:
        # Prediction problems (e.g. a year outside the supported range) are
        # reported with their own message instead of being mistaken for a
        # malformed entry.
        print(f"Could not make predictions for {year_to_predict}: {err}")