In [63]:
import pandas as pd

# Load the dataset
file_path = r'C:\Users\wziller\Milwaukee Tool\_Global AME\MSOE\CS5610\M6\csc5610base\GroupProjectM6\Awards_Batting_HoF.csv'
fe_batting = pd.read_csv(file_path)

# Perform feature engineering


# Group by playerID and aggregate relevant statistics
grouped_df = fe_batting.groupby('playerID').agg(
    years_played=('Batting_df_yearID', lambda x: x.nunique()),
    total_games=('Batting_df_G', 'sum'),
    total_at_bats=('Batting_df_AB', 'sum'),
    total_runs=('Batting_df_R', 'sum'),
    total_hits=('Batting_df_H', 'sum'),
    total_walks=('Batting_df_BB', 'sum'),
    total_doubles=('Batting_df_2B', 'sum'),
    total_triples=('Batting_df_3B', 'sum'),
    total_home_runs=('Batting_df_HR', 'sum'),
    total_RBI=('Batting_df_RBI', 'sum'),
    total_SB=('Batting_df_SB', 'sum'),
    total_CS=('Batting_df_CS', 'sum'),
    total_BB=('Batting_df_BB', 'sum'),
    total_SO=('Batting_df_SO', 'sum'),
    total_IBB=('Batting_df_IBB', 'sum'),
    total_HBP=('Batting_df_HBP', 'sum'),
    total_SH=('Batting_df_SH', 'sum'),
    total_SF=('Batting_df_SF', 'sum'),
    total_GIDP=('Batting_df_GIDP', 'sum'),
    max_HR=('Batting_df_HR', 'max'),
    max_hits=('Batting_df_H', 'max'),
    max_SB=('Batting_df_SB', 'max'),
    HOF_status=('HallOfFame_df_inducted', 'max')
    # HOF_year=('HallOfFame_df_yearid', 'max'),
    # BattingPost_G=('BattingPost_df_G', 'sum'),
    # BattingPost_AB=('BattingPost_df_AB', 'sum'),
    # BattingPost_R=('BattingPost_df_R', 'sum'),
    # BattingPost_H=('BattingPost_df_H', 'sum'),
    # BattingPost_2B=('BattingPost_df_2B', 'sum'),
    # BattingPost_3B=('BattingPost_df_3B', 'sum'),
    # BattingPost_HR=('BattingPost_df_HR', 'sum'),
    # BattingPost_RBI=('BattingPost_df_RBI', 'sum'),
    # BattingPost_SB=('BattingPost_df_SB', 'sum'),
    # BattingPost_CS=('BattingPost_df_CS', 'sum'),
    # BattingPost_BB=('BattingPost_df_BB', 'sum'),
    # BattingPost_SO=('BattingPost_df_SO', 'sum'),
    # BattingPost_IBB=('BattingPost_df_IBB', 'sum'),
    # BattingPost_HBP=('BattingPost_df_HBP', 'sum'),
    # BattingPost_SH=('BattingPost_df_SH', 'sum'),
    # BattingPost_SF=('BattingPost_df_SF', 'sum'),
    # BattingPost_GIDP=('BattingPost_df_GIDP', 'sum')
).reset_index()

grouped_df['total_singles'] = grouped_df['total_hits'] - (grouped_df['total_doubles'] + grouped_df['total_triples'] + grouped_df['total_home_runs'])

# # Merge back with the original DataFrame to preserve all columns
# merge_df = pd.merge(grouped_df, grouped_df, on='playerID', how='left')

# Calculate new features
grouped_df['batting_avg'] = grouped_df['total_hits'] / grouped_df['total_at_bats']
grouped_df['OBP'] = (grouped_df['total_hits'] + grouped_df['total_walks']) / (grouped_df['total_at_bats'] + grouped_df['total_walks'])
grouped_df['SLG'] = (
    (grouped_df['total_singles'] - grouped_df['total_doubles'] - grouped_df['total_triples'] - grouped_df['total_home_runs']) +
    (2 * grouped_df['total_doubles']) +
    (3 * grouped_df['total_triples']) +
    (4 * grouped_df['total_home_runs'])
) / grouped_df['total_at_bats']
grouped_df['OPS'] = grouped_df['OBP'] + grouped_df['SLG']

#27 

# Replace empty values in engineered features with 0
engineered_features = [
    'years_played', 'total_games', 'total_at_bats', 'total_runs', 'total_hits',
    'total_walks', 'total_doubles', 'total_triples', 'total_home_runs',
    'total_RBI', 'total_SB', 'total_CS', 'total_BB', 'total_SO', 'total_IBB',
    'total_HBP', 'total_SH', 'total_SF', 'total_GIDP', 'max_HR', 'max_hits',
    'max_SB', 'total_singles', 'batting_avg', 'OBP', 'SLG', 'OPS'
]

# Fill NaN values with 0
grouped_df[engineered_features] = grouped_df[engineered_features].fillna(0)

# Row count before filtering
print(f"Original dataset size: {len(grouped_df)}")

# Apply minimum thresholds for filtering
# min_games = 50
min_at_bats = 100
# min_ops = 0.1
min_years = 10

filtered_df = grouped_df[
    (grouped_df['years_played'] >= min_years) &
    # (grouped_df['total_games'] >= min_games) &
    (grouped_df['total_at_bats'] >= min_at_bats) 
    # (grouped_df['OPS'] >= min_ops)
]

# Row count after filtering
print(f"Filtered dataset size: {len(filtered_df)}")


# Convert data types
filtered_df['HOF_status'] = filtered_df['HOF_status'].astype('category')
filtered_df['playerID'] = filtered_df['playerID'].astype('string')



# # Identify HOF players in the original dataset
# original_hof_players = grouped_df[grouped_df['HOF_status'] == 1]['playerID']

# # Identify HOF players in the filtered dataset
# filtered_hof_players = filtered_df[filtered_df['HOF_status'] == 1]['playerID']

# # Compare and identify missing HOF players
# hof_players_filtered_out = original_hof_players[~original_hof_players.isin(filtered_hof_players)]

# # Display the result
# if hof_players_filtered_out.empty:
#     print("No Hall of Fame players were filtered out.")
# else:
#     print("The following Hall of Fame players were filtered out:")
#     print(hof_players_filtered_out)


# Save filtered DataFrame
filtered_df.to_csv('Award_Batting_HOF_FE_csv.csv', index=False)
filtered_df.to_feather('Award_Batting_HOF_FE.feather')

filtered_df.info()


# # Save filtered DataFrame
# grouped_df.to_csv('Award_Batting_HOF_FE_csv.csv', index=False)
# grouped_df.to_feather('Award_Batting_HOF_FE.feather')

# grouped_df.info()

Original dataset size: 18724
Filtered dataset size: 2898
<class 'pandas.core.frame.DataFrame'>
Index: 2898 entries, 1 to 18713
Data columns (total 29 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   playerID         2898 non-null   string  
 1   years_played     2898 non-null   int64   
 2   total_games      2898 non-null   float64 
 3   total_at_bats    2898 non-null   float64 
 4   total_runs       2898 non-null   float64 
 5   total_hits       2898 non-null   float64 
 6   total_walks      2898 non-null   float64 
 7   total_doubles    2898 non-null   float64 
 8   total_triples    2898 non-null   float64 
 9   total_home_runs  2898 non-null   float64 
 10  total_RBI        2898 non-null   float64 
 11  total_SB         2898 non-null   float64 
 12  total_CS         2898 non-null   float64 
 13  total_BB         2898 non-null   float64 
 14  total_SO         2898 non-null   float64 
 15  total_IBB        2898 non-null   flo

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['HOF_status'] = filtered_df['HOF_status'].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['playerID'] = filtered_df['playerID'].astype('string')


In [64]:
# grouped_df.head()
filtered_df.head()


Unnamed: 0,playerID,years_played,total_games,total_at_bats,total_runs,total_hits,total_walks,total_doubles,total_triples,total_home_runs,...,total_GIDP,max_HR,max_hits,max_SB,HOF_status,total_singles,batting_avg,OBP,SLG,OPS
1,aaronha01,23,3298.0,12364.0,2174.0,3771.0,1402.0,624.0,98.0,755.0,...,328.0,47.0,223.0,31.0,True,2294.0,0.304998,0.375781,0.435053,0.810834
25,abernte02,14,681.0,181.0,12.0,25.0,6.0,3.0,0.0,0.0,...,4.0,0.0,4.0,0.0,False,22.0,0.138122,0.165775,0.138122,0.303897
33,abreubo01,18,2425.0,8480.0,1453.0,2470.0,1476.0,574.0,59.0,288.0,...,165.0,31.0,183.0,40.0,False,1549.0,0.291274,0.396344,0.366156,0.7625
55,adairje01,13,1165.0,4019.0,378.0,1022.0,208.0,163.0,19.0,57.0,...,149.0,11.0,153.0,7.0,False,783.0,0.254292,0.290987,0.287385,0.578371
60,adamsba01,19,482.0,1019.0,79.0,216.0,53.0,31.0,15.0,3.0,...,0.0,1.0,33.0,1.0,False,167.0,0.211973,0.250933,0.232581,0.483514


In [65]:
# grouped_df.describe()
filtered_df.describe()

Unnamed: 0,years_played,total_games,total_at_bats,total_runs,total_hits,total_walks,total_doubles,total_triples,total_home_runs,total_RBI,...,total_SF,total_GIDP,max_HR,max_hits,max_SB,total_singles,batting_avg,OBP,SLG,OPS
count,2898.0,2898.0,2898.0,2898.0,2898.0,2898.0,2898.0,2898.0,2898.0,2898.0,...,2898.0,2898.0,2898.0,2898.0,2898.0,2898.0,2898.0,2898.0,2898.0,2898.0
mean,13.481021,1162.531401,3702.406832,516.831608,1005.652864,359.163561,172.131125,35.824362,81.376812,473.518979,...,18.633885,58.71049,12.095238,113.700828,13.493789,716.320566,0.237302,0.293405,0.277594,0.571
std,3.128667,832.494354,3352.927175,562.771005,1017.838383,405.19406,180.856891,53.427247,123.693842,527.784207,...,26.904121,72.90081,12.630974,70.955541,17.843317,719.704051,0.057119,0.070136,0.079148,0.146865
min,10.0,140.0,100.0,1.0,6.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,2.0,0.0,3.0,0.044554,0.066351,0.044554,0.123767
25%,11.0,533.25,937.25,82.0,189.0,50.0,27.0,4.0,5.0,78.0,...,0.0,2.0,2.0,28.0,1.0,146.25,0.208139,0.24981,0.228736,0.480428
50%,13.0,1038.0,3191.0,389.5,824.0,256.0,136.0,20.0,34.0,350.0,...,3.0,34.0,8.0,134.0,7.0,583.5,0.255694,0.315215,0.296192,0.614803
75%,15.0,1482.75,5138.5,722.0,1400.75,486.0,243.0,46.0,105.0,636.0,...,31.0,93.0,19.0,173.75,19.0,1001.0,0.275159,0.341329,0.333095,0.672707
max,27.0,5700.0,21752.0,3992.0,7028.0,3416.0,1584.0,618.0,1146.0,4152.0,...,236.0,630.0,73.0,262.0,138.0,5286.0,0.366363,0.480621,0.52834,1.000137
