## Prediction for Returning Date

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [None]:
file_path = './data/frauen_bundesliga_player_injury_history.csv'
df = pd.read_csv(file_path)

print(df.head())

           Player Name          Team Name  Age   Height Position      Value  \
0  1 Maria Luisa Grohs  FC Bayern München   23     1,80      Tor  125.000 €   
1  1 Maria Luisa Grohs  FC Bayern München   23     1,80      Tor  125.000 €   
2    32 Ena Mahmutovic  FC Bayern München   21     1,77      Tor   90.000 €   
3     41 Anna Wellmann  FC Bayern München   29     1,75      Tor   20.000 €   
4     1 Juliane Schmid  FC Bayern München   20  Unknown      Tor  unbekannt   

  Saison         von         bis          Verletzung  \
0  24/25  16.11.2024         NaN               Tumor   
1  24/25  21.10.2024  07.11.2024           Operation   
2  24/25  09.08.2024  12.09.2024       Fußverletzung   
3  24/25  09.08.2024  11.08.2024  muskuläre Probleme   
4  00/00  00.00.0000  00.00.0000      No injury info   

                                          Player URL  
0  https://www.soccerdonna.de/de/maria-luisa-groh...  
1  https://www.soccerdonna.de/de/maria-luisa-groh...  
2  https://www.soccerdo

In [5]:
if 'Height' in df.columns:
    df['Height'] = df['Height'].astype(str).str.replace(',', '.').replace('Unknown', np.nan)
    df['Height'] = pd.to_numeric(df['Height'], errors='coerce')  
    
if 'Value' in df.columns:
    df['Value'] = df['Value'].str.replace('€', '').str.replace('.', '').str.strip()
    df['Value'] = pd.to_numeric(df['Value'], errors='coerce')

df = df.drop(df.columns[-1], axis=1)

if 'von' in df.columns:
    df['von'] = pd.to_datetime(df['von'], errors='coerce', format='%d.%m.%Y')
if 'bis' in df.columns:
    df['bis'] = pd.to_datetime(df['bis'], errors='coerce', format='%d.%m.%Y')

if 'von' in df.columns and 'bis' in df.columns:
    df['Injury Duration'] = (df['bis'] - df['von']).dt.days

df['Injury Duration'] = df['Injury Duration'].fillna(0)

print(df.head())

           Player Name          Team Name  Age  Height Position     Value  \
0  1 Maria Luisa Grohs  FC Bayern München   23    1.80      Tor  125000.0   
1  1 Maria Luisa Grohs  FC Bayern München   23    1.80      Tor  125000.0   
2    32 Ena Mahmutovic  FC Bayern München   21    1.77      Tor   90000.0   
3     41 Anna Wellmann  FC Bayern München   29    1.75      Tor   20000.0   
4     1 Juliane Schmid  FC Bayern München   20     NaN      Tor       NaN   

  Saison        von        bis          Verletzung  Injury Duration  
0  24/25 2024-11-16        NaT               Tumor              0.0  
1  24/25 2024-10-21 2024-11-07           Operation             17.0  
2  24/25 2024-08-09 2024-09-12       Fußverletzung             34.0  
3  24/25 2024-08-09 2024-08-11  muskuläre Probleme              2.0  
4  00/00        NaT        NaT      No injury info              0.0  


In [6]:
prediction_df = df.copy()

# 计算伤病持续时间
prediction_df['Injury Duration for Prediction'] = (prediction_df['bis'] - prediction_df['von']).dt.days

# 删除无法计算持续时间的无效记录
prediction_df = prediction_df[prediction_df['von'].notna()]

# 将缺失的伤病持续时间记录作为待预测目标
prediction_df_to_predict = prediction_df[prediction_df['Injury Duration for Prediction'].isna()]
prediction_df_train = prediction_df[prediction_df['Injury Duration for Prediction'].notna()]

# 取前20种伤病
top_20_injuries = prediction_df_train['Verletzung'].value_counts().head(20).index
print(top_20_injuries)
prediction_df_train = prediction_df_train[prediction_df_train['Verletzung'].isin(top_20_injuries)]

# 对未康复的球员数据进行过滤，只保留前20种伤病类型
prediction_df_to_predict['Prediction Status'] = np.where(
    prediction_df_to_predict['Verletzung'].isin(top_20_injuries), "Predictable", "Unknown"
)
prediction_df_to_predict_filtered = prediction_df_to_predict[prediction_df_to_predict['Prediction Status'] == "Predictable"]

print(f"训练集大小: {prediction_df_train.shape}, 待预测集大小: {prediction_df_to_predict_filtered.shape}")

Index(['Kreuzbandriss', 'Corona-Virus', 'muskuläre Probleme',
       'unbekannte Verletzung', 'Knieverletzung', 'Knieprobleme',
       'Sprunggelenksverletzung', 'krank', 'Schulterverletzung',
       'Fußverletzung', 'Muskelverletzung', 'Syndesmosebandriss',
       'Muskelfaserriss', 'Erkältung', 'Rückenprobleme', 'Knie-OP',
       'Oberschenkelverletzung', 'Mittelfußbruch', 'grippaler Infekt',
       'Meniskusverletzung'],
      dtype='object', name='Verletzung')
训练集大小: (239, 12), 待预测集大小: (25, 13)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction_df_to_predict['Prediction Status'] = np.where(


In [7]:
# 定义特征与目标
features = ['Age', 'Height', 'Position', 'Verletzung', 'Team Name']
target = 'Injury Duration for Prediction'

X = prediction_df_train[features]
y = prediction_df_train[target]

X_to_predict = prediction_df_to_predict_filtered[features]

# 处理缺失值（将身高中 'Unknown' 替换为中位数）
X['Height'] = X['Height'].replace('Unknown', np.nan).astype(float)
X['Height'].fillna(X['Height'].median(), inplace=True)
X_to_predict['Height'] = X_to_predict['Height'].replace('Unknown', np.nan).astype(float)
X_to_predict['Height'].fillna(X['Height'].median(), inplace=True)

# 类别特征编码
categorical_features = ['Position', 'Verletzung', 'Team Name']
numeric_features = ['Age', 'Height']

preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ]
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Height'] = X['Height'].replace('Unknown', np.nan).astype(float)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X['Height'].fillna(X['Height'].median(), inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['Height'].fillna(X[

In [8]:
# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 构建随机森林回归模型
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# 模型训练
model.fit(X_train, y_train)

# 模型评估
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE) on test data: {mse}")
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE) on test data: {rmse} days")

Mean Squared Error (MSE) on test data: 2183.519870716435
Root Mean Squared Error (RMSE) on test data: 46.728148590720295 days


Current models use limited characteristics (e.g., age, position, height, etc.). 

Other factors may influence injury prediction (e.g., historical injury record, medical team's recovery plan, etc.).

Need more data.

In [9]:
# 对缺失的伤病持续时间进行预测
prediction_df_to_predict_filtered['Predicted Injury Duration'] = model.predict(X_to_predict)

prediction_df_to_predict_filtered['Predicted Recovery Date'] = (prediction_df_to_predict_filtered['von'] + pd.to_timedelta(prediction_df_to_predict_filtered['Predicted Injury Duration'], unit='D')).dt.floor('D')

print("Predicted Recovery Date for Records Missing Recovery Time:")
display(prediction_df_to_predict_filtered[['Player Name', 'Verletzung', 'Position', 'Age', 'von', 'Predicted Injury Duration', 'Predicted Recovery Date']])


Predicted Recovery Date for Records Missing Recovery Time:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction_df_to_predict_filtered['Predicted Injury Duration'] = model.predict(X_to_predict)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction_df_to_predict_filtered['Predicted Recovery Date'] = (prediction_df_to_predict_filtered['von'] + pd.to_timedelta(prediction_df_to_predict_filtered['Predicted Injury Duration'], unit='D')).dt.floor('D')


Unnamed: 0,Player Name,Verletzung,Position,Age,von,Predicted Injury Duration,Predicted Recovery Date
17,13 Tainara,krank,Abwehr - Innenverteidigung,25,2024-01-01,16.751667,2024-01-17
21,19 Katharina Naschenweng,Kreuzbandriss,Abwehr - linke Verteidigung,27,2024-06-03,288.706667,2025-03-18
31,8 Lena Oberdorf,Kreuzbandriss,Mittelfeld - defensives Mittelfeld,23,2024-07-16,259.24,2025-04-01
109,16 Camilla Küver,Knieprobleme,Abwehr - Innenverteidigung,21,2024-08-05,39.576667,2024-09-13
140,17 Kristin Demann,Knieprobleme,Mittelfeld - defensives Mittelfeld,31,2024-08-01,20.203333,2024-08-21
186,29 Dilara Açikgöz,Kreuzbandriss,Abwehr,20,2024-07-07,250.836,2025-03-14
193,20 Ilayda Açikgöz,Kreuzbandriss,Mittelfeld - zentrales Mittelfeld,20,2024-05-03,252.156,2025-01-10
195,28 Barbara Dunst,Kreuzbandriss,Mittelfeld,27,2024-12-03,308.662,2025-10-07
237,17 Franziska Harsch,Muskelverletzung,Mittelfeld - zentrales Mittelfeld,27,2024-11-04,84.44631,2025-01-27
275,21 Sofía Cava Marin,Kreuzbandriss,Abwehr - Innenverteidigung,20,2024-10-24,248.023333,2025-06-29
