In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from tqdm.notebook import tqdm
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, balanced_accuracy_score
from sklearn.utils import all_estimators

In [77]:
data = pd.read_csv(r"D:\BIG_DATA\charging_stations\task_2\AllCity.csv")

In [78]:
data.head().T

Unnamed: 0,0,1,2,3,4
Unnamed: 0,0,1,2,3,4
geometry,POLYGON ((494324.36910769687 4140139.900782528...,POLYGON ((494491.37794651155 4140530.022613017...,POLYGON ((494705.80941380473 4141022.185615587...,"POLYGON ((494818.1755447965 4140037.859610446,...",POLYGON ((494818.17554479634 4141263.137357092...
EV_stations_counts,0,0,0,0,0
EV_stations_geomery,[],[],[],[],[]
population,0.0,0.0,0.0,0.0,0.0
landuse,Other,Other,Other,Other,Other
nodes,0,0,0,0,0
edges,0,0,0,0,0
density,0.0,0.0,0.0,0.0,0.0
oneway_exists,No,No,No,No,No


In [79]:
#filtering out columsn to be used for modeling
data.drop(columns=['Unnamed: 0', 'EV_stations_geomery'], inplace=True)

In [80]:
print("data size:" , data.shape)

data size: (8534, 25)


In [81]:
# Fill all NaN values with 0
data.fillna(0, inplace=True)

In [82]:
# Check for missing values in each column
missing_values = data.isna().sum()
print(missing_values)

geometry                  0
EV_stations_counts        0
population                0
landuse                   0
nodes                     0
edges                     0
density                   0
oneway_exists             0
highway_types             0
average_lanes             0
average_maxspeed          0
city                      0
school_count              0
university_count          0
restaurant_count          0
place_of_worship_count    0
community_centre_count    0
townhall_count            0
parking_count             0
library_count             0
park_count                0
commercial_count          0
government_count          0
civic_count               0
retail_count              0
dtype: int64


In [83]:
# Convert string representation of set to actual set and then to list to extract first element
def extract_first_highway(value):
    if pd.isna(value):
        return None
    try:
        # Evaluate the string as a set, convert to list and extract the first item
        evaluated_set = eval(value)
        if isinstance(evaluated_set, set) and evaluated_set:
            return list(evaluated_set)[0]
    except:
        return None

# Apply the function to the 'highway_types' column
data['high_way'] = data['highway_types'].apply(extract_first_highway)

In [84]:
# This code will display the unique values in the 'high_way' column.
unique_values_highway = data['high_way'].unique()
unique_values_highway

array([None, 'motorway', 'motorway_link', 'primary', 'secondary',
       'secondary_link', 'tertiary', 'unclassified', 'residential',
       'trunk', 'trunk_link', 'primary_link', 'tertiary_link'],
      dtype=object)

In [85]:
categorical_columns = ['geometry', 'city', 'landuse', 'oneway_exists', 'highway_types', 'high_way']
data[categorical_columns] = data[categorical_columns].astype(str)

In [86]:
data.dtypes

geometry                   object
EV_stations_counts          int64
population                float64
landuse                    object
nodes                       int64
edges                       int64
density                   float64
oneway_exists              object
highway_types              object
average_lanes             float64
average_maxspeed          float64
city                       object
school_count              float64
university_count          float64
restaurant_count          float64
place_of_worship_count    float64
community_centre_count    float64
townhall_count            float64
parking_count             float64
library_count             float64
park_count                float64
commercial_count          float64
government_count          float64
civic_count               float64
retail_count              float64
high_way                   object
dtype: object

In [87]:
def data_splitter(data, train_cities=None, test_cities=None, test_size=0.2, random_state=23):

    if train_cities is not None:
        train = data[data['city'].isin(train_cities)]
        test = data[data['city'].isin(test_cities)]


        X_train = train.drop(['city','geometry', 'EV_stations_counts', 'highway_types'], axis=1)
        y_train = train['EV_stations_counts'].astype(int)
        y_train = y_train.apply(lambda x: 1 if x > 0 else 0)

        X_test = test.drop(['city','geometry', 'EV_stations_counts', 'highway_types'], axis=1)
        y_test = test['EV_stations_counts'].astype(int)
        y_test = y_test.apply(lambda x: 1 if x > 0 else 0)
    else:
        X = data.drop(['city','geometry', "EV_stations_counts", 'highway_types'], axis=1)
        y = data['EV_stations_counts']
        y = y.apply(lambda x: 1 if x > 0 else 0)
        X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=test_size, random_state=random_state)

    return X_train, X_test, y_train, y_test

In [88]:
X_train, X_test, y_train, y_test = data_splitter(data)

In [89]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Define categorical and numerical features for modelling
categorical_features = ['landuse', 'oneway_exists', 'high_way']
numerical_features = [col for col in X_train.columns if col not in categorical_features]

# Creating a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(), categorical_features)
    ])

# Fit and transform the training data, and transform the test data
# Fitting the preprocessor on the training data and transforming training data
X_train_scaled = preprocessor.fit_transform(X_train)
X_test_scaled = preprocessor.transform(X_test)

In [90]:
# logistic regression
logreg = LogisticRegression()
logreg.fit(X_train_scaled, y_train)
print("Logistic Regression Test Accuracy: ", logreg.score(X_test_scaled, y_test))
# classification report
y_pred = logreg.predict(X_test_scaled)
print(classification_report(y_test, y_pred))

Logistic Regression Test Accuracy:  0.9589923842999414
              precision    recall  f1-score   support

           0       0.96      1.00      0.98      1638
           1       0.47      0.10      0.17        69

    accuracy                           0.96      1707
   macro avg       0.72      0.55      0.57      1707
weighted avg       0.94      0.96      0.95      1707



In [91]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, balanced_accuracy_score
from sklearn.utils import all_estimators
from tqdm import tqdm
import pandas as pd

def run_experiment(X_train, X_test, y_train, y_test):
    # Get all classification model classes
    classifiers = all_estimators(type_filter='classifier')

    # Initialize result table
    results = []
    models = {}
    # Run models and collect results
    for name, ClassifierClass in tqdm(classifiers):
        try:
            # Initialize model
            model = ClassifierClass()
            model.fit(X_train, y_train)
            models[name] = model
            y_pred = model.predict(X_test)
            
            # Calculate metrics
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
            recall = recall_score(y_test, y_pred, average='macro', zero_division=0)
            f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)
            auc = roc_auc_score(y_test, y_pred, multi_class='ovo')
            balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
            
            # Append results
            results.append([name, accuracy, precision, recall, f1, auc, balanced_accuracy])
        except Exception as e:
            print(f"Error occurred for {name}: {str(e)}")

    # Create a DataFrame from results
    results_df = pd.DataFrame(results, columns=["Model", "Accuracy", "Precision", "Recall", "F1-score", "AUC", "Balanced Accuracy"])
    results_df = results_df.sort_values(by=['F1-score', 'AUC'], ascending=False)
    return results_df, models

results_df, models = run_experiment(X_train_scaled, X_test_scaled, y_train, y_test)
print(results_df)



Error occurred for CategoricalNB: index 7 is out of bounds for axis 1 with size 7
Error occurred for ClassifierChain: ClassifierChain.__init__() missing 1 required positional argument: 'base_estimator'
Error occurred for ComplementNB: Negative values in data passed to ComplementNB (input X)




Error occurred for FixedThresholdClassifier: FixedThresholdClassifier.__init__() missing 1 required positional argument: 'estimator'


  probabilities /= normalizer
  probabilities /= normalizer


Error occurred for MultiOutputClassifier: MultiOutputClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for MultinomialNB: Negative values in data passed to MultinomialNB (input X)
Error occurred for NuSVC: specified nu is infeasible
Error occurred for OneVsOneClassifier: OneVsOneClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for OneVsRestClassifier: OneVsRestClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for OutputCodeClassifier: OutputCodeClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for RadiusNeighborsClassifier: No neighbors found for test samples array([   4,    6,    9,   17,   23,   27,   32,   33,   34,   37,   41,
         46,   50,   57,   59,   65,   77,   86,   99,  102,  107,  109,
        113,  118,  119,  123,  134,  140,  143,  145,  146,  150,  155,
        159,  164,  166,  167,  170,  173,  182,  185,  19

100%|██████████| 43/43 [01:01<00:00,  1.42s/it]

Error occurred for StackingClassifier: StackingClassifier.__init__() missing 1 required positional argument: 'estimators'
Error occurred for TunedThresholdClassifierCV: TunedThresholdClassifierCV.__init__() missing 1 required positional argument: 'estimator'
Error occurred for VotingClassifier: VotingClassifier.__init__() missing 1 required positional argument: 'estimators'
                             Model  Accuracy  Precision    Recall  F1-score  \
15      LinearDiscriminantAnalysis  0.950791   0.688051  0.703655  0.695489   
8                       GaussianNB  0.882835   0.598632  0.793186  0.630090   
23   QuadraticDiscriminantAnalysis  0.855888   0.586566  0.806909  0.608428   
21     PassiveAggressiveClassifier  0.862917   0.581563  0.768926  0.603216   
20                 NearestCentroid  0.838899   0.580603  0.811939  0.595628   
6              ExtraTreeClassifier  0.930873   0.582468  0.596101  0.588591   
2                      BernoulliNB  0.811951   0.577274  0.839545  0.5




In [92]:
def run_experiment(X_train_scaled, X_test_scaled, y_train, y_test):
    # Get all classification model classes
    classifiers = all_estimators(type_filter='classifier')

    # Initialize result table
    results = []
    models = {}
    # Run models and collect results
    for name, ClassifierClass in tqdm(classifiers):
        try:
            # Initialize model
            model = ClassifierClass()
            model.fit(X_train, y_train)
            models[name] = model
            y_pred = model.predict(X_test)
            
            # Calculate metrics
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred, average='macro')
            recall = recall_score(y_test, y_pred, average='macro')
            f1 = f1_score(y_test, y_pred, average='macro')
            auc = roc_auc_score(y_test, y_pred)
            balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
            
            # Append results
            results.append([name, accuracy, precision, recall, f1, auc, balanced_accuracy])
        except Exception as e:
            print(f"Error occurred for {name}: {str(e)}")

    # Create a DataFrame from results
    results_df = pd.DataFrame(results, columns=["Model", "Accuracy", "Precision", "Recall", "F1-score", "AUC", "Balanced Accuracy"])
    results_df = results_df.sort_values(by=['F1-score', 'AUC'], ascending=False)
    return results_df, models


In [93]:
result_df, models = run_experiment(X_train_scaled, X_test_scaled, y_train, y_test)

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
100%|██████████| 43/43 [00:00<00:00, 280.97it/s]


Error occurred for AdaBoostClassifier: could not convert string to float: 'Other'
Error occurred for BaggingClassifier: could not convert string to float: 'Other'
Error occurred for BernoulliNB: could not convert string to float: 'Other'
Error occurred for CalibratedClassifierCV: could not convert string to float: 'Other'
Error occurred for CategoricalNB: invalid literal for int() with base 10: 'Other'
Error occurred for ClassifierChain: ClassifierChain.__init__() missing 1 required positional argument: 'base_estimator'
Error occurred for ComplementNB: could not convert string to float: 'Other'
Error occurred for DecisionTreeClassifier: could not convert string to float: 'Other'
Error occurred for ExtraTreeClassifier: could not convert string to float: 'Other'
Error occurred for ExtraTreesClassifier: could not convert string to float: 'Other'
Error occurred for FixedThresholdClassifier: FixedThresholdClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurr

In [94]:
results_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-score,AUC,Balanced Accuracy
15,LinearDiscriminantAnalysis,0.950791,0.688051,0.703655,0.695489,0.703655,0.703655
8,GaussianNB,0.882835,0.598632,0.793186,0.63009,0.793186,0.793186
23,QuadraticDiscriminantAnalysis,0.855888,0.586566,0.806909,0.608428,0.806909,0.806909
21,PassiveAggressiveClassifier,0.862917,0.581563,0.768926,0.603216,0.768926,0.768926
20,NearestCentroid,0.838899,0.580603,0.811939,0.595628,0.811939,0.811939
6,ExtraTreeClassifier,0.930873,0.582468,0.596101,0.588591,0.596101,0.596101
2,BernoulliNB,0.811951,0.577274,0.839545,0.582069,0.839545,0.839545
11,HistGradientBoostingClassifier,0.955477,0.655801,0.553392,0.575518,0.553392,0.553392
17,LogisticRegression,0.958992,0.715012,0.548283,0.572823,0.548283,0.548283
3,CalibratedClassifierCV,0.957235,0.67609,0.547367,0.569489,0.547367,0.547367


In [95]:
results_df.to_csv(r"D:\BIG_DATA\charging_stations\task_2\all_cities_random_shuffle.csv", index=False)

In [97]:
from tqdm import tqdm

# List of big cities and small cities
big_cities = ['Rome', 'Milan']
small_cities = ['Catania', 'Florence', 'Turin']

# Process for big cities
for city in tqdm(big_cities):
    test_cities = [city]
    train_cities = [x for x in big_cities if x != city]
    X_train, X_test, y_train, y_test = data_splitter(data, train_cities=train_cities, test_cities=test_cities)

    # Define categorical and numerical features
    categorical_features = ['landuse', 'oneway_exists', 'high_way']
    numerical_features = [col for col in X_train.columns if col not in categorical_features]

    # Create a column transformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ])
    
    # Fit and transform the training data, and transform the test data
    X_train_scaled = preprocessor.fit_transform(X_train)
    X_test_scaled = preprocessor.transform(X_test)
    results_df, models = run_experiment(X_train_scaled, X_test_scaled, y_train, y_test)
    results_df.to_csv(r"D:\BIG_DATA\charging_stations\task_2\big_cities_test_city_{city}_.csv", index=False)

# Process for small cities
for city in tqdm(small_cities):
    test_cities = [city]
    train_cities = [x for x in small_cities if x != city]
    X_train, X_test, y_train, y_test = data_splitter(data, train_cities=train_cities, test_cities=test_cities)

    # Define categorical and numerical features
    categorical_features = ['landuse', 'oneway_exists', 'high_way']
    numerical_features = [col for col in X_train.columns if col not in categorical_features]

    # Create a column transformer
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_features),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
        ])
    
    # Fit and transform the training data, and transform the test data
    X_train_scaled = preprocessor.fit_transform(X_train)
    X_test_scaled = preprocessor.transform(X_test)
    results_df, models = run_experiment(X_train_scaled, X_test_scaled, y_train, y_test)
    results_df.to_csv(r"D:\BIG_DATA\charging_stations\task_2\small_cities_test_city_{city}_.csv", index=False)


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
100%|██████████| 43/43 [00:00<00:00, 919.51it/s]


Error occurred for AdaBoostClassifier: could not convert string to float: 'industrial'
Error occurred for BaggingClassifier: could not convert string to float: 'industrial'
Error occurred for BernoulliNB: could not convert string to float: 'industrial'
Error occurred for CalibratedClassifierCV: could not convert string to float: 'Other'
Error occurred for CategoricalNB: invalid literal for int() with base 10: 'industrial'
Error occurred for ClassifierChain: ClassifierChain.__init__() missing 1 required positional argument: 'base_estimator'
Error occurred for ComplementNB: could not convert string to float: 'industrial'
Error occurred for DecisionTreeClassifier: could not convert string to float: 'industrial'
Error occurred for ExtraTreeClassifier: could not convert string to float: 'industrial'
Error occurred for ExtraTreesClassifier: could not convert string to float: 'industrial'
Error occurred for FixedThresholdClassifier: FixedThresholdClassifier.__init__() missing 1 required posit


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Error occurred for AdaBoostClassifier: could not convert string to float: 'Other'
Error occurred for BaggingClassifier: could not convert string to float: 'Other'
Error occurred for BernoulliNB: could not convert string to float: 'Other'
Error occurred for CalibratedClassifierCV: could not convert string to float: 'Other'
Error occurred for CategoricalNB: invalid literal for int() with base 10: 'Other'
Error occurred for ClassifierChain: ClassifierChain.__init__() missing 1 required positional argument: 'base_estimator'
Error occurred for ComplementNB: could not convert string to float: 'Other'
Error occurred for DecisionTreeClassifier: could not convert string to float: 'Other'
Error occurred for ExtraTreeClassifier: could not convert string to float: 'Other'
Error occurred for ExtraTreesClassifier: could not convert string to float: 'Other'
Error occurred for FixedThresholdClassifier: FixedThresholdClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurr


100%|██████████| 43/43 [00:00<00:00, 374.36it/s]
100%|██████████| 2/2 [00:00<00:00,  5.70it/s]


Error occurred for NuSVC: could not convert string to float: 'Other'
Error occurred for OneVsOneClassifier: OneVsOneClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for OneVsRestClassifier: OneVsRestClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for OutputCodeClassifier: OutputCodeClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for PassiveAggressiveClassifier: could not convert string to float: 'Other'
Error occurred for Perceptron: could not convert string to float: 'Other'
Error occurred for QuadraticDiscriminantAnalysis: could not convert string to float: 'Other'
Error occurred for RadiusNeighborsClassifier: could not convert string to float: 'Other'
Error occurred for RandomForestClassifier: could not convert string to float: 'Other'
Error occurred for RidgeClassifier: could not convert string to float: 'Other'
Error occurred for RidgeClassifierCV: could not co


[A

Error occurred for AdaBoostClassifier: could not convert string to float: 'Other'
Error occurred for BaggingClassifier: could not convert string to float: 'Other'
Error occurred for BernoulliNB: could not convert string to float: 'Other'
Error occurred for CalibratedClassifierCV: could not convert string to float: 'Other'
Error occurred for CategoricalNB: invalid literal for int() with base 10: 'Other'
Error occurred for ClassifierChain: ClassifierChain.__init__() missing 1 required positional argument: 'base_estimator'
Error occurred for ComplementNB: could not convert string to float: 'Other'
Error occurred for DecisionTreeClassifier: could not convert string to float: 'Other'
Error occurred for ExtraTreeClassifier: could not convert string to float: 'Other'
Error occurred for ExtraTreesClassifier: could not convert string to float: 'Other'
Error occurred for FixedThresholdClassifier: FixedThresholdClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurr

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
100%|██████████| 43/43 [00:00<00:00, 904.96it/s]



Error occurred for AdaBoostClassifier: could not convert string to float: 'Other'


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
100%|██████████| 43/43 [00:00<00:00, 512.59it/s]


Error occurred for BaggingClassifier: could not convert string to float: 'Other'
Error occurred for BernoulliNB: could not convert string to float: 'Other'
Error occurred for CalibratedClassifierCV: could not convert string to float: 'Other'
Error occurred for CategoricalNB: invalid literal for int() with base 10: 'Other'
Error occurred for ClassifierChain: ClassifierChain.__init__() missing 1 required positional argument: 'base_estimator'
Error occurred for ComplementNB: could not convert string to float: 'Other'
Error occurred for DecisionTreeClassifier: could not convert string to float: 'Other'
Error occurred for ExtraTreeClassifier: could not convert string to float: 'Other'
Error occurred for ExtraTreesClassifier: could not convert string to float: 'Other'
Error occurred for FixedThresholdClassifier: FixedThresholdClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for GaussianNB: could not convert string to float: 'Other'
Error occurred for G


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
100%|██████████| 43/43 [00:00<00:00, 878.63it/s]
100%|██████████| 3/3 [00:00<00:00,  9.20it/s]

Error occurred for AdaBoostClassifier: could not convert string to float: 'Other'
Error occurred for BaggingClassifier: could not convert string to float: 'Other'
Error occurred for BernoulliNB: could not convert string to float: 'Other'
Error occurred for CalibratedClassifierCV: could not convert string to float: 'Other'
Error occurred for CategoricalNB: invalid literal for int() with base 10: 'Other'
Error occurred for ClassifierChain: ClassifierChain.__init__() missing 1 required positional argument: 'base_estimator'
Error occurred for ComplementNB: could not convert string to float: 'Other'
Error occurred for DecisionTreeClassifier: could not convert string to float: 'Other'
Error occurred for ExtraTreeClassifier: could not convert string to float: 'Other'
Error occurred for ExtraTreesClassifier: could not convert string to float: 'Other'
Error occurred for FixedThresholdClassifier: FixedThresholdClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurr




In [None]:
"""
Rome, Milan: Big CITY EXP-1
Turin, Florence, Catania: EXP-2
"""

# EXP-1
big_cities = ['Rome', 'Milan']
small_cities = ['Catania', 'Florence', 'Turin']


# make a table in the end to summarise the results of all experiments

# big cities splited in train and test where only one big city is test and all possible combinations for this
for city in tqdm(big_cities):
    test_cities = [city]
    train_cities = [x for x in big_cities if x != city]
    X_train, X_test, y_train, y_test = data_splitter(data, train_cities=train_cities, test_cities=test_cities)
    results_df, models = run_experiment(X_train, X_test, y_train, y_test)
    results_df.to_csv(r"charging_stations/task_2/big_cities_test_city_{city}_.csv", index=False)
    

# small cities splited in train and test where only one small city is test and all possible combinations for this
for city in tqdm(small_cities):
    test_cities = [city]
    train_cities = [x for x in small_cities if x != city]
    X_train, X_test, y_train, y_test = data_splitter(data, train_cities=train_cities, test_cities=test_cities)
    results_df, models = run_experiment(X_train, X_test, y_train, y_test)
    results_df.to_csv(r"charging_stations/task_2/small_cities_test_city_{city}_.csv", index=False)

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/41 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Error occurred for CategoricalNB: index 51 is out of bounds for axis 1 with size 51
Error occurred for ClassifierChain: _BaseChain.__init__() missing 1 required positional argument: 'base_estimator'


  probabilities /= normalizer
  probabilities /= normalizer
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Pleas

Error occurred for MultiOutputClassifier: MultiOutputClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for NuSVC: specified nu is infeasible
Error occurred for OneVsOneClassifier: OneVsOneClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for OneVsRestClassifier: OneVsRestClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for OutputCodeClassifier: OutputCodeClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for RadiusNeighborsClassifier: No neighbors found for test samples array([   2,    9,   17, ..., 3877, 3878, 3879]), you can try using larger radius, giving a label for outliers, or considering removing them from your dataset.
Error occurred for StackingClassifier: StackingClassifier.__init__() missing 1 required positional argument: 'estimators'
Error occurred for VotingClassifier: VotingClassifier.__init__() missing 1 required pos

  0%|          | 0/41 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Error occurred for CategoricalNB: index 53 is out of bounds for axis 1 with size 48
Error occurred for ClassifierChain: _BaseChain.__init__() missing 1 required positional argument: 'base_estimator'


  probabilities /= normalizer
  probabilities /= normalizer
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Pleas

Error occurred for MultiOutputClassifier: MultiOutputClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for NuSVC: specified nu is infeasible
Error occurred for OneVsOneClassifier: OneVsOneClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for OneVsRestClassifier: OneVsRestClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for OutputCodeClassifier: OutputCodeClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for RadiusNeighborsClassifier: No neighbors found for test samples array([  15,   17,   22, ..., 1409, 1410, 1420]), you can try using larger radius, giving a label for outliers, or considering removing them from your dataset.
Error occurred for StackingClassifier: StackingClassifier.__init__() missing 1 required positional argument: 'estimators'
Error occurred for VotingClassifier: VotingClassifier.__init__() missing 1 required pos

  0%|          | 0/41 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Error occurred for CategoricalNB: index 13 is out of bounds for axis 1 with size 10
Error occurred for ClassifierChain: _BaseChain.__init__() missing 1 required positional argument: 'base_estimator'


  _warn_prf(average, modifier, msg_start, len(result))
  probabilities /= normalizer
  probabilities /= normalizer
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://s

Error occurred for MultiOutputClassifier: MultiOutputClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for NuSVC: specified nu is infeasible
Error occurred for OneVsOneClassifier: OneVsOneClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for OneVsRestClassifier: OneVsRestClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for OutputCodeClassifier: OutputCodeClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for RadiusNeighborsClassifier: No neighbors found for test samples array([ 11,  20,  21,  23,  27,  30,  31,  32,  39,  40,  41,  42,  43,
        44,  45,  46,  49,  50,  51,  52,  54,  55,  56,  57,  58,  59,
        64,  65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  77,
        79,  80,  81,  82,  84,  85,  86,  95,  98,  99, 100, 101, 102,
       103, 104, 106, 107, 108, 109, 110, 111, 112, 119, 122, 123, 124,
       12

  0%|          | 0/41 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Error occurred for CategoricalNB: index 12 is out of bounds for axis 1 with size 11
Error occurred for ClassifierChain: _BaseChain.__init__() missing 1 required positional argument: 'base_estimator'


  probabilities /= normalizer
  probabilities /= normalizer
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Pleas

Error occurred for MultiOutputClassifier: MultiOutputClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for NuSVC: specified nu is infeasible
Error occurred for OneVsOneClassifier: OneVsOneClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for OneVsRestClassifier: OneVsRestClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for OutputCodeClassifier: OutputCodeClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for RadiusNeighborsClassifier: No neighbors found for test samples array([   6,   12,   13,   19,   20,   21,   27,   28,   29,   30,   31,
         32,   33,   41,   43,   44,   45,   46,   55,   56,   57,   58,
         59,   60,   61,   66,   67,   68,   69,   70,   71,   79,   80,
         81,   82,   83,   84,   86,   88,   89,   90,   91,   92,   93,
         94,   95,   96,   97,  101,  102,  103,  104,  105,  106,  107,
    

  0%|          | 0/4 [00:00<?, ?it/s]

  0%|          | 0/41 [00:00<?, ?it/s]

Error occurred for AdaBoostClassifier: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by AdaBoostClassifier.
Error occurred for BaggingClassifier: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by BaggingClassifier.
Error occurred for BernoulliNB: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by BernoulliNB.


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


Error occurred for CalibratedClassifierCV: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by LinearSVC.
Error occurred for CategoricalNB: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by CategoricalNB.
Error occurred for ClassifierChain: _BaseChain.__init__() missing 1 required positional argument: 'base_estimator'
Error occurred for ComplementNB: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by ComplementNB.
Error occurred for DecisionTreeClassifier: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by DecisionTreeClassifier.
Error occurred for DummyClassifier: Found array with 0 sample(s) (shape=(0,)) while a minimum of 1 is required.
Error occurred for ExtraTreeClassifier: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by ExtraTreeClassifier.
Error occurred for ExtraTreesClassifier: Found array with 0 sample(s) (shape=(0, 19)) whil

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Error occurred for LogisticRegression: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by LogisticRegression.


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Error occurred for LogisticRegressionCV: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by LogisticRegressionCV.
Error occurred for MLPClassifier: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by MLPClassifier.
Error occurred for MultiOutputClassifier: MultiOutputClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for MultinomialNB: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by MultinomialNB.
Error occurred for NearestCentroid: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by NearestCentroid.
Error occurred for NuSVC: specified nu is infeasible
Error occurred for OneVsOneClassifier: OneVsOneClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for OneVsRestClassifier: OneVsRestClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for OutputCodeClassifie

  0%|          | 0/41 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Error occurred for CategoricalNB: index 87 is out of bounds for axis 1 with size 46
Error occurred for ClassifierChain: _BaseChain.__init__() missing 1 required positional argument: 'base_estimator'


  probabilities /= normalizer
  probabilities /= normalizer
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Pleas

Error occurred for MultiOutputClassifier: MultiOutputClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for NuSVC: specified nu is infeasible
Error occurred for OneVsOneClassifier: OneVsOneClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for OneVsRestClassifier: OneVsRestClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for OutputCodeClassifier: OutputCodeClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for RadiusNeighborsClassifier: No neighbors found for test samples array([  7,  11,  15,  21,  25,  27,  28,  29,  39,  40,  41,  42,  43,
        47,  51,  52,  53,  54,  55,  58,  59,  63,  64,  65,  66,  70,
        71,  72,  73,  74,  75,  76,  82,  83,  84,  85,  86,  87,  88,
        90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 102, 103, 104,
       105, 106, 107, 108, 109, 110, 112, 115, 116, 120, 121, 122, 123,
       12

  _warn_prf(average, modifier, msg_start, len(result))


  0%|          | 0/41 [00:00<?, ?it/s]



Error occurred for AdaBoostClassifier: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by AdaBoostClassifier.
Error occurred for BaggingClassifier: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by BaggingClassifier.
Error occurred for BernoulliNB: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by BernoulliNB.


  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)


Error occurred for CalibratedClassifierCV: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by LinearSVC.
Error occurred for CategoricalNB: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by CategoricalNB.
Error occurred for ClassifierChain: _BaseChain.__init__() missing 1 required positional argument: 'base_estimator'
Error occurred for ComplementNB: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by ComplementNB.
Error occurred for DecisionTreeClassifier: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by DecisionTreeClassifier.
Error occurred for DummyClassifier: Found array with 0 sample(s) (shape=(0,)) while a minimum of 1 is required.
Error occurred for ExtraTreeClassifier: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by ExtraTreeClassifier.
Error occurred for ExtraTreesClassifier: Found array with 0 sample(s) (shape=(0, 19)) whil

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Error occurred for LogisticRegressionCV: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by LogisticRegressionCV.
Error occurred for MLPClassifier: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by MLPClassifier.
Error occurred for MultiOutputClassifier: MultiOutputClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for MultinomialNB: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by MultinomialNB.
Error occurred for NearestCentroid: Found array with 0 sample(s) (shape=(0, 19)) while a minimum of 1 is required by NearestCentroid.
Error occurred for NuSVC: specified nu is infeasible
Error occurred for OneVsOneClassifier: OneVsOneClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for OneVsRestClassifier: OneVsRestClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for OutputCodeClassifie

  0%|          | 0/41 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


Error occurred for CategoricalNB: index 47 is out of bounds for axis 1 with size 33
Error occurred for ClassifierChain: _BaseChain.__init__() missing 1 required positional argument: 'base_estimator'


  _warn_prf(average, modifier, msg_start, len(result))
  probabilities /= normalizer
  probabilities /= normalizer
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://s

Error occurred for MultiOutputClassifier: MultiOutputClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for NuSVC: specified nu is infeasible
Error occurred for OneVsOneClassifier: OneVsOneClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for OneVsRestClassifier: OneVsRestClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for OutputCodeClassifier: OutputCodeClassifier.__init__() missing 1 required positional argument: 'estimator'
Error occurred for RadiusNeighborsClassifier: No neighbors found for test samples array([  0,   2,   3,   8,  11,  12,  14,  16,  17,  19,  21,  22,  24,
        26,  27,  28,  29,  30,  33,  35,  36,  37,  38,  39,  40,  43,
        44,  45,  46,  47,  48,  49,  50,  54,  56,  57,  58,  59,  60,
        61,  62,  63,  64,  65,  66,  67,  68,  72,  73,  74,  75,  77,
        78,  79,  80,  81,  82,  83,  84,  86,  87,  89,  90,  91,  92,
        9

  _warn_prf(average, modifier, msg_start, len(result))


In [98]:
results_df

Unnamed: 0,Model,Accuracy,Precision,Recall,F1-score,AUC,Balanced Accuracy
0,DummyClassifier,0.868637,0.434319,0.5,0.464851,0.5,0.5


In [20]:
import pandas as pd
import glob

# Get a list of all result files from different experiments
result_files = glob.glob("/home/bahramkhanbaloch/media3/MS/Semester1/DS/results/*.csv")

# Create a dictionary to store the total AUC and count for each model
auc_sum_per_model = {}
count_per_model = {}

# Iterate over each result file
for file in result_files:
    print(file)
    # Load the results for each experiment
    results = pd.read_csv(file)
    
    # Iterate over each row in the results
    for _, row in results.iterrows():
        model = row['Model']
        auc = row['AUC']
        
        # Update the total AUC and count for the model
        if model in auc_sum_per_model:
            auc_sum_per_model[model] += auc
            count_per_model[model] += 1
        else:
            auc_sum_per_model[model] = auc
            count_per_model[model] = 1

# Calculate the average AUC for each model
average_auc_per_model = {model: auc_sum_per_model[model] / count_per_model[model] for model in auc_sum_per_model}

# Create a DataFrame from the average AUC dictionary
average_auc_df = pd.DataFrame(list(average_auc_per_model.items()), columns=['Model', 'Average AUC'])

# Sort the DataFrame by Average AUC in descending order
sorted_models = average_auc_df.sort_values(by='Average AUC', ascending=False)

# Select the top 5 models
top_5_models = sorted_models.head(5)

# Display the best models
print(top_5_models)


/home/bahramkhanbaloch/media3/MS/Semester1/DS/results/all_cities_random_shuffle.csv
/home/bahramkhanbaloch/media3/MS/Semester1/DS/results/small_cities_test_city_Mainz_.csv
/home/bahramkhanbaloch/media3/MS/Semester1/DS/results/small_cities_test_city_Trier_.csv
/home/bahramkhanbaloch/media3/MS/Semester1/DS/results/big_cities_test_city_Munich_.csv
/home/bahramkhanbaloch/media3/MS/Semester1/DS/results/big_cities_test_city_Berlin_.csv
/home/bahramkhanbaloch/media3/MS/Semester1/DS/results/small_cities_test_city_Saarbrücken_.csv
/home/bahramkhanbaloch/media3/MS/Semester1/DS/results/big_cities_test_city_Stuttgart_.csv
/home/bahramkhanbaloch/media3/MS/Semester1/DS/results/small_cities_test_city_Karlsruhe_.csv
/home/bahramkhanbaloch/media3/MS/Semester1/DS/results/big_cities_test_city_Frankfurt_.csv
                     Model  Average AUC
1              BernoulliNB     0.784188
13         NearestCentroid     0.767735
8             ComplementNB     0.661513
4            MultinomialNB     0.657249


In [23]:
top_5_models

['BernoulliNB',
 'NearestCentroid',
 'ComplementNB',
 'MultinomialNB',
 'DecisionTreeClassifier']

In [31]:
import pandas as pd
import glob

# Get a list of all result files from different experiments
result_files = glob.glob("/home/bahramkhanbaloch/media3/MS/Semester1/DS/results/*.csv")

# Create an empty DataFrame to store the combined results
combined_results = pd.DataFrame()

# Create an empty DataFrame to store the summary
summary_results = pd.DataFrame(columns=['type_city'])

# Iterate over each result file
for type_city in ['big', 'small', 'all']:
    # Reset combined_results for each type_city iteration
    combined_results = pd.DataFrame()

    # Iterate over each result file
    for file in result_files:
        # Load the results for each experiment
        if type_city in file:
            results = pd.read_csv(file)
            
            # Append the results to the combined DataFrame
            combined_results = combined_results.append(results)

    # Calculate the average AUC for each model
    average_auc_per_model = combined_results.groupby('Model')['AUC'].mean()
    
    # Sort the models by average AUC in descending order
    sorted_models = average_auc_per_model.sort_values(ascending=False)
    
    # Filter the results to include only the rows corresponding to the top 5 models
    filtered_results = combined_results[combined_results['Model'].isin(top_5_models)]

    # Calculate the average AUC for each model
    average_auc_by_model = filtered_results.groupby('Model')['AUC'].mean()
    
    # Create a row with type_city and average AUC values for each model
    row = {'type_city': type_city}
    row.update(average_auc_by_model)
    
    # Append the row to the summary_results DataFrame
    summary_results = summary_results.append(row, ignore_index=True)

# Display the summary_results DataFrame
print(summary_results)


  type_city  BernoulliNB  ComplementNB  DecisionTreeClassifier  MultinomialNB  \
0       big     0.764115      0.669090                0.621317       0.653853   
1     small     0.810223      0.654498                0.681870       0.666221   
2       all     0.810952      0.651410                0.672189       0.661778   

   NearestCentroid  
0         0.715620  
1         0.853362  
2         0.837221  


  combined_results = combined_results.append(results)
  combined_results = combined_results.append(results)
  combined_results = combined_results.append(results)
  combined_results = combined_results.append(results)
  summary_results = summary_results.append(row, ignore_index=True)
  combined_results = combined_results.append(results)
  combined_results = combined_results.append(results)
  combined_results = combined_results.append(results)
  combined_results = combined_results.append(results)
  summary_results = summary_results.append(row, ignore_index=True)
  combined_results = combined_results.append(results)
  combined_results = combined_results.append(results)
  combined_results = combined_results.append(results)
  combined_results = combined_results.append(results)
  combined_results = combined_results.append(results)
  summary_results = summary_results.append(row, ignore_index=True)


In [32]:
summary_results

Unnamed: 0,type_city,BernoulliNB,ComplementNB,DecisionTreeClassifier,MultinomialNB,NearestCentroid
0,big,0.764115,0.66909,0.621317,0.653853,0.71562
1,small,0.810223,0.654498,0.68187,0.666221,0.853362
2,all,0.810952,0.65141,0.672189,0.661778,0.837221


In [40]:
import pandas as pd
import glob

# Get a list of all result files from different experiments
result_files = glob.glob("/home/bahramkhanbaloch/media3/MS/Semester1/DS/results/*.csv")

# Create an empty DataFrame to store the combined results
combined_results = pd.DataFrame()

# Create an empty DataFrame to store the summary
summary_results = pd.DataFrame(columns=['type_city', 'AUC', 'Accuracy', 'Precision', 'Recall'])

# Iterate over each result file
for type_city in ['big', 'small', 'all']:
    # Reset combined_results for each type_city iteration
    combined_results = pd.DataFrame()

    # Iterate over each result file
    for file in result_files:
        # Load the results for each experiment
        if type_city in file:
            results = pd.read_csv(file)
            
            # Append the results to the combined DataFrame
            combined_results = combined_results.append(results)

    # Filter the results to include only the rows corresponding to the top 5 models
    filtered_results = combined_results[combined_results['Model'].isin(top_5_models)]

    # Calculate the average values for each metric
    average_metrics_per_model = filtered_results.groupby('Model')['AUC', 'Accuracy', 'Precision', 'Recall'].mean()

    # Calculate the average values for each metric
    average_values = average_metrics_per_model.mean()

    # Create a row with type_city, average values for each metric
    row = {'type_city': type_city}
    for metric in ['AUC', 'Accuracy', 'Precision', 'Recall']:
        row[metric] = average_values[metric]

    # Append the row to the summary_results DataFrame
    summary_results = summary_results.append(row, ignore_index=True)

# Display the summary_results DataFrame
summary_results


  combined_results = combined_results.append(results)
  combined_results = combined_results.append(results)
  combined_results = combined_results.append(results)
  combined_results = combined_results.append(results)
  average_metrics_per_model = filtered_results.groupby('Model')['AUC', 'Accuracy', 'Precision', 'Recall'].mean()
  summary_results = summary_results.append(row, ignore_index=True)
  combined_results = combined_results.append(results)
  combined_results = combined_results.append(results)
  combined_results = combined_results.append(results)
  combined_results = combined_results.append(results)
  average_metrics_per_model = filtered_results.groupby('Model')['AUC', 'Accuracy', 'Precision', 'Recall'].mean()
  summary_results = summary_results.append(row, ignore_index=True)
  combined_results = combined_results.append(results)
  combined_results = combined_results.append(results)
  combined_results = combined_results.append(results)
  combined_results = combined_results.append(r

Unnamed: 0,type_city,AUC,Accuracy,Precision,Recall
0,big,0.684799,0.813313,0.6492,0.684799
1,small,0.733235,0.884658,0.617062,0.733235
2,all,0.72671,0.876493,0.62105,0.72671


In [46]:
top_5_models

['BernoulliNB',
 'NearestCentroid',
 'ComplementNB',
 'MultinomialNB',
 'DecisionTreeClassifier']

In [45]:
summary_results.to_csv("/home/bahramkhanbaloch/media3/MS/Semester1/DS/results/summary_results.csv", index=False)