In [24]:
import pandas as pd

# Load the dataset
file_path = r'C:\Users\wziller\Milwaukee Tool\_Global AME\MSOE\CS5610\M6\csc5610base\GroupProjectM6\Awards_Batting_HoF.csv'
fe_batting = pd.read_csv(file_path)

# Perform feature engineering


# Group by playerID and aggregate relevant statistics
grouped_df = fe_batting.groupby('playerID').agg(
    years_played=('Batting_df_yearID', lambda x: x.nunique()),
    total_games=('Batting_df_G', 'sum'),
    total_at_bats=('Batting_df_AB', 'sum'),
    total_runs=('Batting_df_R', 'sum'),
    total_hits=('Batting_df_H', 'sum'),
    total_walks=('Batting_df_BB', 'sum'),
    total_doubles=('Batting_df_2B', 'sum'),
    total_triples=('Batting_df_3B', 'sum'),
    total_home_runs=('Batting_df_HR', 'sum'),
    total_RBI=('Batting_df_RBI', 'sum'),
    total_SB=('Batting_df_SB', 'sum'),
    total_CS=('Batting_df_CS', 'sum'),
    total_BB=('Batting_df_BB', 'sum'),
    total_SO=('Batting_df_SO', 'sum'),
    total_IBB=('Batting_df_IBB', 'sum'),
    total_HBP=('Batting_df_HBP', 'sum'),
    total_SH=('Batting_df_SH', 'sum'),
    total_SF=('Batting_df_SF', 'sum'),
    total_GIDP=('Batting_df_GIDP', 'sum'),
    max_HR=('Batting_df_HR', 'max'),
    max_hits=('Batting_df_H', 'max'),
    max_SB=('Batting_df_SB', 'max'),
    HOF_status=('HallOfFame_df_inducted', 'max')
    # HOF_year=('HallOfFame_df_yearid', 'max'),
    # BattingPost_G=('BattingPost_df_G', 'sum'),
    # BattingPost_AB=('BattingPost_df_AB', 'sum'),
    # BattingPost_R=('BattingPost_df_R', 'sum'),
    # BattingPost_H=('BattingPost_df_H', 'sum'),
    # BattingPost_2B=('BattingPost_df_2B', 'sum'),
    # BattingPost_3B=('BattingPost_df_3B', 'sum'),
    # BattingPost_HR=('BattingPost_df_HR', 'sum'),
    # BattingPost_RBI=('BattingPost_df_RBI', 'sum'),
    # BattingPost_SB=('BattingPost_df_SB', 'sum'),
    # BattingPost_CS=('BattingPost_df_CS', 'sum'),
    # BattingPost_BB=('BattingPost_df_BB', 'sum'),
    # BattingPost_SO=('BattingPost_df_SO', 'sum'),
    # BattingPost_IBB=('BattingPost_df_IBB', 'sum'),
    # BattingPost_HBP=('BattingPost_df_HBP', 'sum'),
    # BattingPost_SH=('BattingPost_df_SH', 'sum'),
    # BattingPost_SF=('BattingPost_df_SF', 'sum'),
    # BattingPost_GIDP=('BattingPost_df_GIDP', 'sum')
).reset_index()

grouped_df['total_singles'] = grouped_df['total_hits'] - (grouped_df['total_doubles'] + grouped_df['total_triples'] + grouped_df['total_home_runs'])

# # Merge back with the original DataFrame to preserve all columns
# merge_df = pd.merge(grouped_df, grouped_df, on='playerID', how='left')

# Calculate new features
grouped_df['batting_avg'] = grouped_df['total_hits'] / grouped_df['total_at_bats']
grouped_df['OBP'] = (grouped_df['total_hits'] + grouped_df['total_walks']) / (grouped_df['total_at_bats'] + grouped_df['total_walks'])
grouped_df['SLG'] = (
    (grouped_df['total_singles'] - grouped_df['total_doubles'] - grouped_df['total_triples'] - grouped_df['total_home_runs']) +
    (2 * grouped_df['total_doubles']) +
    (3 * grouped_df['total_triples']) +
    (4 * grouped_df['total_home_runs'])
) / grouped_df['total_at_bats']
grouped_df['OPS'] = grouped_df['OBP'] + grouped_df['SLG']

#27 

# Replace empty values in engineered features with 0
engineered_features = [
    'years_played', 'total_games', 'total_at_bats', 'total_runs', 'total_hits',
    'total_walks', 'total_doubles', 'total_triples', 'total_home_runs',
    'total_RBI', 'total_SB', 'total_CS', 'total_BB', 'total_SO', 'total_IBB',
    'total_HBP', 'total_SH', 'total_SF', 'total_GIDP', 'max_HR', 'max_hits',
    'max_SB', 'total_singles', 'batting_avg', 'OBP', 'SLG', 'OPS'
]

# Fill NaN values with 0
grouped_df[engineered_features] = grouped_df[engineered_features].fillna(0)

# Row count before filtering
print(f"Original dataset size: {len(grouped_df)}")

# Apply minimum thresholds for filtering
min_games = 50
min_at_bats = 100
min_ops = 0.25

filtered_df = grouped_df[
    (grouped_df['total_games'] >= min_games) &
    (grouped_df['total_at_bats'] >= min_at_bats) &
    (grouped_df['OPS'] >= min_ops)
]

# Row count after filtering
print(f"Filtered dataset size: {len(filtered_df)}")


# Convert data types
filtered_df['HOF_status'] = filtered_df['HOF_status'].astype('category')
filtered_df['playerID'] = filtered_df['playerID'].astype('string')

# Save filtered DataFrame
filtered_df.to_csv('Award_Batting_HOF_FE_csv.csv', index=False)
filtered_df.to_feather('Award_Batting_HOF_FE.feather')

filtered_df.info()

Original dataset size: 18724
Filtered dataset size: 8146
<class 'pandas.core.frame.DataFrame'>
Index: 8146 entries, 1 to 18722
Data columns (total 29 columns):
 #   Column           Non-Null Count  Dtype   
---  ------           --------------  -----   
 0   playerID         8146 non-null   string  
 1   years_played     8146 non-null   int64   
 2   total_games      8146 non-null   float64 
 3   total_at_bats    8146 non-null   float64 
 4   total_runs       8146 non-null   float64 
 5   total_hits       8146 non-null   float64 
 6   total_walks      8146 non-null   float64 
 7   total_doubles    8146 non-null   float64 
 8   total_triples    8146 non-null   float64 
 9   total_home_runs  8146 non-null   float64 
 10  total_RBI        8146 non-null   float64 
 11  total_SB         8146 non-null   float64 
 12  total_CS         8146 non-null   float64 
 13  total_BB         8146 non-null   float64 
 14  total_SO         8146 non-null   float64 
 15  total_IBB        8146 non-null   flo

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['HOF_status'] = filtered_df['HOF_status'].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df['playerID'] = filtered_df['playerID'].astype('string')


In [25]:
grouped_df.head()

Unnamed: 0,playerID,years_played,total_games,total_at_bats,total_runs,total_hits,total_walks,total_doubles,total_triples,total_home_runs,...,total_GIDP,max_HR,max_hits,max_SB,HOF_status,total_singles,batting_avg,OBP,SLG,OPS
0,aardsda01,9,331.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,False,0.0,0.0,0.0,0.0,0.0
1,aaronha01,23,3298.0,12364.0,2174.0,3771.0,1402.0,624.0,98.0,755.0,...,328.0,47.0,223.0,31.0,True,2294.0,0.304998,0.375781,0.435053,0.810834
2,aaronto01,7,437.0,944.0,102.0,216.0,86.0,42.0,6.0,13.0,...,36.0,8.0,77.0,6.0,False,155.0,0.228814,0.293204,0.262712,0.555916
3,aasedo01,13,448.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,False,0.0,0.0,0.0,0.0,0.0
4,abadan01,3,15.0,21.0,1.0,2.0,4.0,0.0,0.0,0.0,...,1.0,0.0,2.0,0.0,False,2.0,0.095238,0.24,0.095238,0.335238


In [26]:
grouped_df.describe()

Unnamed: 0,years_played,total_games,total_at_bats,total_runs,total_hits,total_walks,total_doubles,total_triples,total_home_runs,total_RBI,...,total_SF,total_GIDP,max_HR,max_hits,max_SB,total_singles,batting_avg,OBP,SLG,OPS
count,18724.0,18724.0,18724.0,18724.0,18724.0,18724.0,18724.0,18724.0,18724.0,18724.0,...,18724.0,18724.0,18724.0,18724.0,18724.0,18724.0,18724.0,18724.0,18724.0,18724.0
mean,5.014847,292.557306,821.013726,110.23339,216.627857,76.078883,36.578936,7.77382,16.192373,99.21128,...,3.789468,12.489746,3.299829,39.441198,4.134426,156.082728,0.170208,0.215444,0.190744,0.403892
std,4.617795,527.807546,1891.249653,291.755375,544.659428,207.432638,95.414644,25.12887,57.666437,270.931057,...,12.846341,36.53784,7.419488,58.048645,10.283106,386.701877,0.127198,0.148602,0.146794,0.284802
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1.0,13.0,6.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,1.0,0.066616,0.117647,0.066667,0.197479
50%,3.0,77.0,71.0,6.0,12.0,4.0,2.0,0.0,0.0,4.0,...,0.0,0.0,0.0,8.0,0.0,10.0,0.198547,0.25,0.211921,0.460526
75%,7.0,327.0,605.0,63.0,133.0,40.0,21.0,4.0,4.0,54.0,...,0.0,4.0,2.0,58.0,3.0,101.0,0.251243,0.311938,0.286944,0.598417
max,27.0,5700.0,21752.0,3992.0,7028.0,3416.0,1584.0,618.0,1146.0,4152.0,...,236.0,630.0,73.0,262.0,138.0,5286.0,1.0,1.0,2.0,3.0
