# Combined Functions

### Imports

In [1]:
import pandas as pd
import slippi as slp
from collections import defaultdict
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import seaborn as sns
import numpy as np
from matplotlib import cm
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import IsolationForest
import matplotlib.patches as patches
import statistics
from sklearn.preprocessing import OneHotEncoder
import statistics
import os
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix,recall_score  
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV,cross_val_score
from sklearn.compose import ColumnTransformer
import scipy.stats
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn import linear_model

### Maps Dictionary Defined

In [None]:
maps = {
    "YOSHIS_STORY": {
        'death_boundary': patches.Rectangle((-175.70, -91.00), 349.3, 259.00, fill=False, color='red'),
        'main_stage':[(-56.000, 0), (56.000, 0)],
        'top_platform':[(-15.75, 42.00), (15.75, 42.00)], 
        'left_platform':[(-59.50, 23.45),(-28.00, 23.45)],
        'right_platform':[(59.50, 23.45), (28.00, 23.45)],
        'left_ledge':[(-57.000,0),(-57,-5)],
        'right_ledge':[(57.000,0),(57,-5)],
    },
    "FINAL_DESTINATION": {
        'death_boundary': patches.Rectangle((-246.00, -140.00), 492.00, 328.00, fill=False, color='red'),
        'main_stage': [(-85.5606, 0), (85.5606, 0)],
        'left_ledge':[(-86.000,0),(-86,-5)],
        'right_ledge':[(86.000,0),(86,-5)]
    },
    "FOUNTAIN_OF_DREAMS": {
        'death_boundary': patches.Rectangle((-198.75, -146.25), 397.50, 348.75, fill=False, color='red'),
        'main_stage': [( -51.2608, 0), (51.2608, 0)],
        'top_platform': [(-14.25, 42.75), (14.25, 42.75)],
        'left_platform':[(-49.50, 16.125),(-21.00, 16.125)], 
        'right_platform':[(21.00, 22.125),(49.50, 22.125)],
        'left_ledge':[(-52.000,0),(-52,-5)],
        'right_ledge':[(52.000,0),(52,-5)],
    },
    "DREAM_LAND_N64": {
        'death_boundary': patches.Rectangle((-255.00, -123.00), 510.00, 373.00, fill=False, color='red'),
        'main_stage': [(-76.3364, 0), (76.3364, 0)],
        'left_platform': [(-61.3896, 30.1422),(-31.7215, 30.1422)],
        'top_platform': [(-19.0188, 51.4264), (19.0188, 51.4264)],
        'right_platform':[(31.7051, 30.2425),(63.0764, 30.2425)] ,
        'left_ledge':[(-77.000,0),(-77,-5)],
        'right_ledge':[(77.000,0),(77,-5)]
    },
    "BATTLEFIELD": {
        'death_boundary': patches.Rectangle((-224, -108.8), 400, 308, fill=False, color='red'),
        'main_stage': [(-68.4000, 0.0000), (68.4, 0.0000)],
        'left_platform': [(-57.60, 27.20),(-20.00, 27.20)],
        'right_platform': [(20.00, 27.20),(57.60, 27.20)],
        'top_platform': [(-18.80, 54.40), (18.80, 54.40)],
        'left_ledge': [(-69,0),(-69,-5)],
        'right_ledge': [(69,0),(69,-5)]
    }
}

In [None]:
# Define the map boundaries
map_bounds = {
    'YOSHIS_STORY': {'left_x': -175.7, 'right_x': 173.6, 'upper_y': 168, 'lower_y': -90},
    'FINAL_DESTINATION': {'left_x': -246, 'right_x': 246, 'upper_y': 188, 'lower_y': -140},
    'FOUNTAIN_OF_DREAMS': {'left_x': -198.75, 'right_x': 198.75, 'upper_y': 202.5, 'lower_y': -146.25},
    'DREAM_LAND_N64': {'left_x': -255, 'right_x': 255, 'upper_y': 250, 'lower_y': -123},
    'BATTLEFIELD': {'left_x': -224, 'right_x': 224, 'upper_y': 200, 'lower_y': -108.8},
    'POKEMON_STADIUM': {'left_x': -230, 'right_x': 230, 'upper_y': 180, 'lower_y': -111}
}

## Point-to-line function

In [None]:
def point_to_line_dist(point, line):
    """Calculate the distance between a point and a line segment.

    Args:
        point (tuple): The x, y coordinates of the point.
        line (list): The x, y coordinates of the endpoints of the line segment.

    Returns:
        float: The shortest distance from the point to the line segment.
    """
    line = np.array(line)
    point = np.array(point)
    line_diff = np.diff(line, axis=0).flatten()  # Flatten the array here
    line_length = np.sqrt((line_diff**2).sum())
    if line_length > 0:
        t = max(0, min(1, np.dot(point - line[0], line_diff) / line_length**2))
        projection = line[0] + t * line_diff
        return np.sqrt(((point - projection)**2).sum())
    else:
        return np.sqrt(((point - line[0])**2).sum())



## Classify positions function

In [None]:
def classify_positions(df, map_name, threshold=5):
    # Get the map data
    map_data = maps[map_name]

    labels = {}

    # For each platform in the map data
    for platform_name, platform_data in map_data.items():
        if platform_name not in ['death_boundary', 'main_stage']:
            # Calculate the distance from each point to the line
            distances = df.apply(lambda row: point_to_line_dist((row['x'], row['y']), platform_data), axis=1)
            # Get the points within the threshold
            close_points = df[distances < threshold]
            # Add the points to the corresponding line
            labels[platform_name] = close_points.index.tolist()

    df['label'] = None  # Set default label

    # Assign labels 
    for label, points in labels.items():
        df.loc[points, 'label'] = label

    # Drop the points with no label
    df = df.dropna(subset=['label'])

    # Count the number of points for each label
    label_counts = df['label'].value_counts()

    # Get the label with the most points
    most_points_label = label_counts.idxmax()

    return most_points_label

## Extract post data function

In [None]:
def extract_post_data(game):
    post_data = defaultdict(list)
    for frame in game.frames:
        for port in frame.ports:
            if port:
                post = port.leader.post
                character = post.character.name
                position = post.position
                direction = post.direction.name
                damage = post.damage
                shield = post.shield
                stocks = post.stocks
                last_attack_landed = post.last_attack_landed.name if post.last_attack_landed else None
                last_hit_by = post.last_hit_by
                combo_count = post.combo_count
                state_age = post.state_age
                flags = post.flags
                hit_stun = post.hit_stun
                airborne = post.airborne
                ground = post.ground
                jumps = post.jumps
                l_cancel = post.l_cancel.name if post.l_cancel else None
                post_data['Character'].append(character)
                post_data['Position'].append(position)
                post_data['Direction'].append(direction)
                post_data['Damage'].append(damage)
                post_data['Shield'].append(shield)
                post_data['Stocks'].append(stocks)
                post_data['Last Attack Landed'].append(last_attack_landed)
                post_data['Last Hit By'].append(last_hit_by)
                post_data['Combo Count'].append(combo_count)
                post_data['State Age'].append(state_age)
                post_data['Flags'].append(flags)
                post_data['Hit Stun'].append(hit_stun)
                post_data['Airborne'].append(airborne)
                post_data['Ground'].append(ground)
                post_data['Jumps'].append(jumps)
                post_data['L Cancel'].append(l_cancel)
    
    df = pd.DataFrame(post_data)
    return df

## Process data Function

In [None]:
def processData(gameFilePath):
    try:
        # games should be a list of filepaths perta
        # Take in a game replay
        # Pull out the character, and all significant data
        # Convert the data into two dataframe rows
        # Return this dataframe to be concatenated
        game_set_df = pd.DataFrame()
        # do wrapping here to define game
        game = slp.Game(gameFilePath)
        real_players = [i for i, value in enumerate(game.start.players) if value is not None]
        if (game.end.lras_initiator == None and (game.start.players[real_players[0]].type == 0 and game.start.players[real_players[1]].type == 0)and game.start.stage.name!='POKEMON_STADIUM'):
            fight_df = extract_post_data(game)
            #---------------------#
            # Get Character Names
            characters = list(set(fight_df['Character']))
            if (len(characters) == 1):
                characters.append(characters[0])
            if ('ZELDA' in characters):
                print(f"Zelda in: {gameFilePath}")
            if (len(characters) > 2):
                return None

            #---------------------#
            # Get Stage
            game_stage = game.start.stage.name
            #---------------------#
            # Get L-Cancel Rate
            # Extract char_1_df (rows 0, 2, 4, 6, ...)
            char_1_df = fight_df.iloc[::2]
            # Extract char_2_df (rows 1, 3, 5, 7, ...)
            char_2_df = fight_df.iloc[1::2]
            
            # Char 1
            number_of_success_1 = (char_1_df['L Cancel'] == 'SUCCESS').sum()
            number_of_fails_1 = (char_1_df['L Cancel'] == 'FAILURE').sum()

            # Char 2
            number_of_success_2 = (char_2_df['L Cancel'] == 'SUCCESS').sum()
            number_of_fails_2 = (char_2_df['L Cancel'] == 'FAILURE').sum()

            #---------------------#
            # Find Winner
            char_1_win = 0
            char_2_win = 0
            if (len(char_1_df['Stocks'].value_counts()) == 5 and len(char_2_df['Stocks'].value_counts()) != 5):
                char_1_win = 0
                char_2_win = 1
            elif (len(char_2_df['Stocks'].value_counts()) == 5 and len(char_1_df['Stocks'].value_counts()) != 5):
                char_1_win = 1
                char_2_win = 0
            else:
                return None
            #---------------------#
            # Max combo count
            max_combo_1 = max(char_1_df['Combo Count'])
            max_combo_2 = max(char_2_df['Combo Count'])
            #---------------------#
            # Unpack the positions into two lists
            position_list = fight_df['Position'][::2].tolist()
            position_list_2 = fight_df['Position'][1::2].tolist()

            x = []
            y = []
            x_2 = []
            y_2 = []

            for position in position_list:
                x.append(position.x)
                y.append(position.y)
            for position in position_list_2:
                x_2.append(position.x)
                y_2.append(position.y)

            # Create DataFrame for player1
            player1_data = {'x': x, 'y': y}
            player1_df = pd.DataFrame(player1_data)

            # Create DataFrame for player2
            player2_data = {'x': x_2, 'y': y_2}
            player2_df = pd.DataFrame(player2_data)
            #---------------------#
            pos_1 = classify_positions(player1_df,game_stage)
            pos_2 = classify_positions(player2_df,game_stage)
            #print(f"Most used platform for {characters[0]} is: {pos_1}")
            #print(f"Most used platform for {characters[1]} is: {pos_2}")

            #---------------------#
            # Dan, Average X
            #---------------------#
            standard_center_dist_0 = []
            standard_center_dist_1 = []
            center = 0
            stage_bounds = map_bounds[game_stage]
            mean_center_dist_0 = -1 # for chatacter 0
            mean_center_dist_1 = -1 # for chatacter 1

            # this finds the true center of the map in the x direction, in the case that
            # the left and right bounds are not equal.
            if abs(stage_bounds['left_x']) != abs(stage_bounds['right_x']):
                center = (stage_bounds['left_x'] + stage_bounds['right_x'])/2.0

            #go through each frame of the fight
            for index in fight_df.index:
                #compute the distances from the center
                pos_string = str(fight_df['Position'].loc[index])[1:-1] # get the pos string
                pos_x = float(pos_string[:pos_string.index(",")]) # get the X coord from the string
                center_distance = abs(pos_x + center) # works. if left is larger, adds, else subtracts 

                #standardize the distances
                #since the left/right direction of the stage does not matter, 
                # each value will only need to be standerdized by half the stange width
                half_stage_x_size = (abs(stage_bounds['right_x']) + abs(stage_bounds['left_x']))/2
                standard_center_distance = center_distance/half_stage_x_size
                if index%2==0:
                    standard_center_dist_1.append(standard_center_distance)
                else:
                    standard_center_dist_0.append(standard_center_distance)
            mean_center_dist_0 = statistics.mean(standard_center_dist_0)
            mean_center_dist_1 = statistics.mode(standard_center_dist_1)

            data = [
                {'Character': characters[0], 'Win': char_1_win, 'Stage': game_stage, "L-Cancel Successes" : number_of_success_1, "L-Cancel Failures" : number_of_fails_1, "Max Combo" : max_combo_1, "Mean X Distance" : mean_center_dist_0,"Platform" : pos_1},
                {'Character': characters[1], 'Win': char_2_win, 'Stage': game_stage, "L-Cancel Successes" : number_of_success_2, "L-Cancel Failures" : number_of_fails_2, "Max Combo" : max_combo_2, "Mean X Distance" : mean_center_dist_1,"Platform": pos_2}
            ]
            game_set_df = pd.DataFrame(data)
        else:
            return None
    except:
        return None

    return game_set_df

# Start

In [None]:
#Get the directory of games
dir_replays = 'Games/'

#List fights used to iterate through each fight within the Games
fights = []

#Add each fight to fights
for path in os.listdir(dir_replays):
    full = os.path.join(dir_replays, path)
    if os.path.isfile(full):
        fights.append(full)

In [None]:
all_game_df = pd.DataFrame()
bad_games = 0
for fight in fights:
    fight_df = processData(fight)
    if fight_df is not None:
        all_game_df = pd.concat([all_game_df, fight_df], ignore_index=True)
    else:
        bad_games += 1

all_game_df

Zelda in: Games/Game_20191207T190030.slp
Zelda in: Games/Game_20191208T104406.slp
Zelda in: Games/Game_8C56C529AEAA_20230519T112050.slp
Zelda in: Games/Game_8C56C529AEAA_20230519T112505.slp
Zelda in: Games/Game_8C56C529AEAA_20230519T125301.slp
Zelda in: Games/Game_8C56C529AEAA_20230519T125625.slp
Zelda in: Games/Game_8C56C529AEAA_20230519T133746.slp
Zelda in: Games/Game_8C56C529AEAA_20230519T140445.slp
Zelda in: Games/Game_8C56C529AEAA_20230519T162227.slp
Zelda in: Games/Game_8C56C529AEAA_20230519T202157.slp
Zelda in: Games/Game_8C56C529AEAA_20230520T114811.slp
Zelda in: Games/Game_8C56C529AEAA_20230520T170718.slp
Zelda in: Games/Game_8C56C529AEAA_20230521T184530.slp
Zelda in: Games/Game_8C56C529AEAA_20230521T193929.slp


Unnamed: 0,Character,Win,Stage,L-Cancel Successes,L-Cancel Failures,Max Combo,Mean X Distance,Platform
0,FALCO,1,BATTLEFIELD,42,11,2,0.197848,top_platform
1,MARTH,0,BATTLEFIELD,20,6,1,0.315179,right_platform
2,FALCO,0,FOUNTAIN_OF_DREAMS,29,19,2,0.185873,left_platform
3,MARTH,1,FOUNTAIN_OF_DREAMS,24,1,2,0.000000,left_ledge
4,FALCO,0,YOSHIS_STORY,27,9,1,0.154211,left_platform
...,...,...,...,...,...,...,...,...
653,FOX,0,BATTLEFIELD,40,2,2,0.057143,left_platform
654,YOSHI,1,FINAL_DESTINATION,30,4,4,0.178601,right_ledge
655,FOX,0,FINAL_DESTINATION,59,4,1,0.065041,left_ledge
656,YOSHI,1,FINAL_DESTINATION,29,7,4,0.186423,right_ledge


# Random forest modeling

In [None]:
rf_df = all_game_df 
#NEEDED, permutation of platform hurt the model
rf_df = rf_df.drop(columns=['Platform'])
character_encoded = pd.get_dummies(rf_df['Character'], prefix='Character')
stage_encoded = pd.get_dummies(rf_df['Stage'], prefix='Stage')
rf_df = pd.concat([all_game_df, character_encoded,stage_encoded], axis=1)
rf_df = all_game_df.drop(['Character', 'Stage'], axis=1)

In [None]:
X = rf_df.drop(columns=['Win'])
y = rf_df['Win']

In [None]:
#Hyperparam tuning
cv = StratifiedKFold(n_splits=5, shuffle=True,random_state=8)
model = RandomForestClassifier()
space = {'n_estimators': list(range(10, 101, 10)), 'criterion': ['gini', 'entropy'], 'max_depth': list(range(5, 21, 1)), 'min_samples_split': list(range(2, 51, 1)), 'min_samples_leaf': list(range(1, 51, 1)), 'max_features': ['sqrt', 'log2', None], 'max_leaf_nodes': list(range(5, 51, 1))}
search = RandomizedSearchCV(model, space, n_iter=500, scoring='accuracy', random_state=1, cv=cv)
search.fit(X,y)

In [None]:
optimized_params = search.best_params_
RF = RandomForestClassifier(**optimized_params)
RF.fit(X, y)

In [None]:
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
# store performance metrics
accuracy_scores = []
recall_scores = []
# Initialize a zero matrix for storing sum of all confusion matrices
sum_conf_matrix = np.zeros((2, 2))  # Binary classification
# Iterate over folds
for train_idx, val_idx in cv.split(X, y):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
 
    RF.fit(X_train, y_train) # training
 
    #  model eval
    accuracy = RF.score(X_val, y_val)
    accuracy_scores.append(accuracy)
    y_pred_val = RF.predict(X_val)
    recall = recall_score(y_val, y_pred_val)
    recall_scores.append(recall)
    # Compute confusion matrix for this fold and add to the sum
    conf_matrix = confusion_matrix(y_val, y_pred_val)
    sum_conf_matrix += conf_matrix
# calculate average accuracy
average_accuracy = sum(accuracy_scores) / len(accuracy_scores)
print(f"Average Accuracy: {average_accuracy:.4f}")
average_recall = sum(recall_scores) / len(recall_scores)
print(f"Average Recall (Sensitivity): {average_recall:.4f}")

### Functions

In [None]:
def adjust_r(r2, obs, feats):
    # print("observations " + str(obs))
    # print("number of features " + str(feats))
    return 1-(((1-r2)*(obs-1))/(obs-feats-1))

# LOGIT Model

In [None]:
all_games = pd.read_csv("AllGames.csv")

# Determine any correletions
if the data is strongly correlated, then I won't want to include both as variables in my initial model

In [None]:
categorical_features = ["Character","Stage","Platform"]
for feature in categorical_features:
    all_games[feature + "_factorized"] = pd.factorize(all_games[feature])[0]

## FOR AllGames
the mean X distance values for evenly numbered games are mostly outliers, since normally a game one is played closer to 0, not further. Use the odd indicies
## FOR TheoGames
use odd too...? So most winners are odd...?

In [None]:
half_games=all_games.iloc[1::2].reset_index(drop=True)
# sns.countplot(data=half_games, x="Win")
sns.boxplot(x="Win", y="Mean X Distance", palette="muted", data=half_games)

In [None]:
columns = ["L-Cancel Successes","L-Cancel Failures","Max Combo","Mean X Distance","Character_factorized","Stage_factorized","Platform_factorized"]
# palette0 = 
sns.pairplot(all_games.iloc[1::2], hue="Win")
# sns.scatterplot(data=all_games[columns], x="L-Cancel Successes", y="Mean X Distance")

### all_games
There does not seem to be any correlation, which is good, but there also seems to be no clear clustering. Likely a model will have trouble predicting. It looks like the mean distance has the "best" sepereation, but even it's seperation is quite bad. A model will probably have alot of trouble predicting
# Hypothesis testing

In [None]:
half_games = all_games.iloc[1::2]
half_games.reset_index(drop=True)

In [None]:
hypothesis_tests = pd.DataFrame(columns=columns)
for variable in columns:
    # if(counties[variable].dtype == "float64"): # for some reason only some of the float64 values have null values
    #     checkna = counties[variable].isna()
    #     print("Na values: "+ str(checkna[checkna == True].shape[0]) + "/" + str(counties[variable].shape[0]))
    #     print("filling with mean")
    #     counties[variable] = counties[variable].fillna(counties[variable].mean())
    result = scipy.stats.linregress(half_games['Win'], half_games[variable])
    p = result[3]
    stat = result[2]
    # print("p: " + str(p))
    # print("stat: " + str(stat))
    hypothesis_tests.loc["p-value", [variable]] = float(p)
    hypothesis_tests.loc["test_statistic", [variable]] = stat
    hypothesis_tests.loc["statistically_significant", [variable]] = (p <= 0.05) # then is signicicant: reject the null hypothesis and accept the alternative hypothesis.
hypothesis_tests = hypothesis_tests.transpose()
hypothesis_tests.head(100)
# scipy.stats.linregress(counties['life-expectancy'], counties['avg income'])

### all_games
The results of hypothesis testing are not suprising. There shouldn't be many clear predictors in the data set since none were found when looking for correlations and clusters. 
# Model creation

In [None]:
scaler = preprocessing.StandardScaler().fit(all_games[["L-Cancel Successes","L-Cancel Failures","Max Combo"]])
scaled_features = scaler.transform(all_games[["L-Cancel Successes","L-Cancel Failures","Max Combo"]])
scaled_df = pd.DataFrame(scaled_features)
scaled_df.columns = ["L-Cancel Successes","L-Cancel Failures","Max Combo"]
for feature in categorical_features:
    scaled_df[feature] = all_games[feature + "_factorized"]
scaled_df["Win"] = all_games["Win"]
scaled_df["Mean X Distance"] = all_games["Mean X Distance"]
scaled_df["constant"] = 0
scaled_df
#so then here would be the k-fold CV code. I don't think it's nessicary to include that here

In [None]:
scaled_df = scaled_df.iloc[1::2]
# X = scaled_df.drop(columns=['Win'])
X = scaled_df[["Mean X Distance","L-Cancel Failures"]] # allgames
# X = scaled_df[["Mean X Distance","Max Combo"]] # theo
y = scaled_df['Win']

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

model = LogisticRegression()  # Replace with whatever model

# store performance metrics
accuracy_scores = []
recall_scores = []

# Iterate over folds
for train_idx, val_idx in kf.split(X, y):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model.fit(X_train, y_train) # training

    #  model eval
    accuracy = model.score(X_val, y_val)
    accuracy_scores.append(accuracy)
    y_pred_val = model.predict(X_val)
    recall = recall_score(y_val, y_pred_val)
    recall_scores.append(recall)


# calculate average accuracy
average_accuracy = sum(accuracy_scores) / len(accuracy_scores)
print(f"Average Accuracy: {average_accuracy:.4f}")
# average_recall = sum(recall_scores) / len(recall_scores)
# print(f"Average Recall (Sensitivity): {average_recall:.4f}")

### all_games
The accuracy is, unsuprizingly, not very good... compared to the iris data set. However based on our clustering, this is very good. If it is able to correlty predict the outcome of a game more that 50% of the time, based on data that has no clear seperation, is very good. Recall isn't quite as good. Looks like our model is pessimistic. However, given the data, I'm still happy with this
## Greedy

In [None]:
reg = linear_model.LinearRegression(fit_intercept=False)

In [None]:
features = ['L-Cancel Successes', 'L-Cancel Failures', 'Max Combo', 'Character',
       'Stage', 'Platform', 'Mean X Distance'] #all possible features
all_models = []
for feature in features:
    data = {}
    data["feature"] = feature
    data["model"] = reg.fit(scaled_df[feature].array.reshape(-1, 1),scaled_df["Win"])
    data["X"] = scaled_df[feature].array.reshape(-1, 1)
    data["Y"] = scaled_df["Win"]
    r = data["model"].score(data["X"],data["Y"])
    data["r"] =  adjust_r(r, scaled_df[feature].size,1)
    all_models.append(data)
all_models.sort(key=lambda x:x['r'], reverse=True) #sorted from highest to lowest adjusted r^2

In [None]:
X = scaled_df["constant"].array.reshape(-1, 1)
model3 = reg.fit(X,y)
greedy_feat = ["constant"]
old_r = adjust_r(reg.score(scaled_df["constant"].array.reshape(-1, 1),y), scaled_df[greedy_feat].size,1)
print('initial adj r2:',old_r)
for data in all_models: #go in order of sorted models
    greedy_feat.append(data["feature"])
    model3 = reg.fit(scaled_df[greedy_feat].to_numpy(),y)
    r = reg.score(scaled_df[greedy_feat].to_numpy(),y)
    shape = scaled_df[greedy_feat].shape
# #     new_r = adjust_r(r, shape[1], shape[0])
    new_r = adjust_r(r, shape[0], shape[1])
    print(greedy_feat,old_r,new_r,shape)
    if (new_r > old_r):
        old_r = new_r
        print(data["feature"] + ": " + str(r))
    else:
        del greedy_feat[-1]
        print(data["feature"] + " no increase")
# model3 = reg.fit(scaled_df[greedy_feat].to_numpy(),y)
# model3_pred = reg.predict(scaled_df[greedy_feat].to_numpy())

In [None]:
# X = scaled_df.drop(columns=['Win'])
X = scaled_df[greedy_feat]
y = scaled_df['Win']

kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

model = LogisticRegression()  # Replace with whatever model

# store performance metrics
accuracy_scores = []
recall_scores = []

# Iterate over folds
for train_idx, val_idx in kf.split(X, y):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    model.fit(X_train, y_train) # training

    #  model eval
    accuracy = model.score(X_val, y_val)
    accuracy_scores.append(accuracy)
    y_pred_val = model.predict(X_val)
    recall = recall_score(y_val, y_pred_val)
    recall_scores.append(recall)


# calculate average accuracy
average_accuracy = sum(accuracy_scores) / len(accuracy_scores)
print(f"Average Accuracy: {average_accuracy:.4f}")
# average_recall = sum(recall_scores) / len(recall_scores)
# print(f"Average Recall (Sensitivity): {average_recall:.4f}")

In [None]:
print(greedy_feat)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=0c89de8a-b7c8-41f0-8e73-e99755abdc67' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>