In [65]:
import pandas as pd
import numpy as np
import plotly.express as px
import matplotlib.pyplot as plt
import seaborn as sns
import time
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, recall_score, accuracy_score, precision_score, f1_score, classification_report

# k-NN
from sklearn.neighbors import KNeighborsClassifier

# Naive Bayes
from sklearn.naive_bayes import GaussianNB

# AdaBoost
from sklearn.ensemble import AdaBoostClassifier

# Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier

# Linear Discriminant Analysis (LDA)
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Decision Tree
from sklearn.tree import DecisionTreeClassifier

# Random Forest
from sklearn.ensemble import RandomForestClassifier

# XGBoost
from xgboost import XGBClassifier

# CatBoost
from catboost import CatBoostClassifier
from imblearn.over_sampling import RandomOverSampler


In [66]:
df=pd.read_csv(r"D:\Y2S1\project\dataSets\bcsc_risk_factors_summarized1_092020\bcsc_risk_factors_summarized1_092020.csv")

In [67]:
df

Unnamed: 0,year,age_group_5_years,race_eth,first_degree_hx,age_menarche,age_first_birth,BIRADS_breast_density,current_hrt,menopaus,bmi_group,biophx,breast_cancer_history,count
0,2013,7,1,0,9,3,1,1,2,3,0,0,7
1,2013,7,1,0,9,3,1,1,2,3,1,0,3
2,2013,7,1,0,9,3,1,1,2,4,0,0,6
3,2013,7,1,0,9,3,1,1,2,4,1,0,1
4,2013,7,1,0,9,3,1,1,2,4,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
509405,2017,13,9,9,9,9,1,9,2,9,1,0,1
509406,2017,13,9,9,9,9,2,0,2,9,0,0,1
509407,2017,13,9,9,9,9,2,9,2,9,9,9,1
509408,2017,13,9,9,9,9,3,0,2,9,0,0,1


In [68]:
df=df.iloc[:10000]

## functions 

In [69]:
def column_info(data):
    for x in data.columns:
        print(f"column \"{x}\"'s info")
        print("-"*30)
        print(f"the num of nan values: {data[x].isna().sum()}")
        print(f"the unique values: {data[x].unique()}")
        print(f"the type is: {data[x].dtype}")
        print("="*30)






def nan_rows(df):
    nan_counts = []
    nan_percentages = []
    
    for index, row in df.iterrows():
        nan_count = row.isna().sum()
        nan_percentage = ((nan_count / df.shape[1]) * 100).round(2)# why we don't make it with mean : as it counts the values of the columns, not the size  
        
        
        nan_counts.append(nan_count)
        nan_percentages.append(nan_percentage)
    
    df_nan = pd.DataFrame({
        "NaN Count": nan_counts,
        "Percentage of NaNs (%)": nan_percentages
    }, index=df.index)
    
    return df_nan

def nan_row_check(df, df_nan, threshold=50, interactive=False):

    for index in df_nan.index:
        if df_nan.loc[index, "Percentage of NaNs (%)"] > threshold:
            if interactive:
                print(f"Row {index} info:\n{df.loc[index]}")
                x = input(f"Row {index} exceeds {threshold}% NaNs. Drop it? ('Y/N'): ").strip().upper()
            else:
                x = "Y"
            if x == "Y":
                df = df.drop(index, axis=0)
    return df.reset_index(drop=True)



def nan_filling(df):
    # Handle categorical columns
    categorical_cols = df.select_dtypes(include=["object", "category"])
    if not categorical_cols.empty:
        modes = categorical_cols.mode().iloc[0]  # Get the first row of the mode DataFrame
        df[categorical_cols.columns] = categorical_cols.fillna(modes)

    # Handle numeric columns
    numeric_cols = df.select_dtypes(include="number")
    if not numeric_cols.empty:
        medians = numeric_cols.median()
        df[numeric_cols.columns] = numeric_cols.fillna(medians)

    return df.reset_index



def nan_group_filling(df, group_by_column):# 
    # Handle categorical columns
    if df[group_by_column].dtype.kind in 'iufc':  # Check if numeric
            df[group_by_column]=df[group_by_column].fillna(df[group_by_column].median())#i made this line as if we group by column with nan values will not belong to any group
    else:
            df[group_by_column]=df[group_by_column].fillna(df[group_by_column].mode())#i made this line as if we group by column with nan values will not belong to any group



    
    categorical_cols = df.select_dtypes(include=["object", "category"]).columns
    
    for col in categorical_cols:
        df[col]=df.groupby(group_by_column)[col].transform(lambda x:x.fillna(x.mode().iloc[0] if x.mode() else np.nan))


    
    # Handle numeric columns
    numeric_cols = df.select_dtypes(include="number").columns
    for col in numeric_cols:
        # Get the median per group
        median_per_group = df.groupby(group_by_column)[col].median()
        # Fill missing values with the median for each group
        df[col] = df.groupby(group_by_column)[col].transform(lambda x: x.fillna(median_per_group[x.name]))

    return df



def nan_columns(df):
    nan_counts = df.isna().sum()
    nan_percentages = (df.isna().mean() * 100).round(2)
    
    df_nan = pd.DataFrame({
        'NaN Count': nan_counts,
        'Percentage of NaNs (%)': nan_percentages
    })
    return df_nan

def nan_columns_check(df, df_nan, threshold=50, interactive=False):
   
    for column in df_nan.index:
        if df_nan.loc[column, "Percentage of NaNs (%)"] > threshold:
            if interactive:
                print(f"Column '{column}' info:\n{df[column]}")
                x = input(f"Column '{column}' exceeds {threshold}% NaNs. Drop it? ('Y/N'): ").strip().upper()
            else:
                x = "Y"
            if x == "Y":
                df = df.drop(column, axis=1)
    return df

# this function for the columns like zip code or id as we can't deal with these nan values by mean or median as they are sensitve data and can't be predicted 
def unique_columns(df, column):
    nan_values_index = df[df[column].isna()].index
    df = df.drop(nan_values_index).reset_index(drop=True)
    return df

def nan_replace(x, data,columns):# here i put the columns here as in this data the x may indicate nan values in columns but in other columns it indicates actual values 
    for column in columns:
        
        data[column].replace(x, np.nan, inplace=True)
    return data
        
# here if we want to ddeal with the rows at first and then the columns 
def data_cleaning(df, column):
    start_time = time.time()  # Start timing
    df_nan_columns = nan_columns(df)
    df_nan_rows = nan_rows(df)
    df = nan_row_check(df, df_nan_rows)
    df = nan_columns_check(df, df_nan_columns)
    df = unique_columns(df, column)
    df = nan_filling(df)
    end_time = time.time()
    return df 
    
    # End timing
    
    processing_time = end_time - start_time
    print(f"Data cleaning completed in {processing_time:.4f} seconds.")
    return df

def data_cleaning2(df,group_by_column):
    start_time = time.time()  # Start timing
    df_nan_columns = nan_columns(df)
    df_nan_rows = nan_rows(df)
    df = nan_columns_check(df, df_nan_columns)
    df = nan_row_check(df, df_nan_rows)
    df = nan_group_filling(df,group_by_column)
    end_time = time.time()  # End timing
    
    processing_time = end_time - start_time
    print(f"Data cleaning (method 2) completed in {processing_time:.4f} seconds.")
    return df

# here if we want to deal with the columns at first and then the rows 

def data_cleaning3(df, column):
    
    start_time = time.time()  # Start timing
    df_nan_columns = nan_columns(df)
    df_nan_rows = nan_rows(df)
    df = nan_columns_check(df, df_nan_columns)
    df = nan_row_check(df, df_nan_rows)
    
    df = unique_columns(df, column)
    df = nan_filling(df)
    end_time = time.time()
    return df 
    
    # End timing
    
    processing_time = end_time - start_time
    print(f"Data cleaning completed in {processing_time:.4f} seconds.")
    return df
df_nan_rows=nan_rows(df)
df_nan_columns=nan_columns(df)

## this one is for the nan values method 2
def data_cleaning4(df):
    start_time = time.time()  # Start timing
    df_nan_columns = nan_columns(df)
    df_nan_rows = nan_rows(df)
    df = nan_row_check(df, df_nan_rows)
    df = nan_columns_check(df, df_nan_columns)
    
    end_time = time.time()

    return df
def nan_columns_bar(df_nan):
    # Create a bar chart using Plotly with a custom color
    fig = px.bar(
        df_nan, 
        x=df_nan.index, 
        y='NaN Count', 
        text='Percentage of NaNs (%)', 
        height=500, 
        width=600
    )
    
    # Update layout to match your desired style
    fig.update_layout(
        title="NaN Count per Column",
        xaxis_title="Features",
        yaxis_title=f'NaN Count out of ({df_nan.shape[ 0]})',
        template="plotly_white"
    )
    
    # Update the x-axis labels to rotate them
    fig.update_xaxes(tickangle=70)
    
    # Automatically display percentage on top of bars
    fig.update_traces(texttemplate='%{text:.1f}%', textposition='inside')
    
    # Display the chart
    fig.show()



def find_outliers(s):
        q1 = s.quantile(0.25)
        q3 = s.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - 1.5 * iqr
        upper_bound = q3 + 1.5 * iqr
        outliers = s[(s < lower_bound) | (s > upper_bound)]
        return outliers



def find_grouped_outliers(df, group_by_col, target_col):
   
    outliers = df.groupby(group_by_col)[target_col].apply(find_outliers)
    
    # Flatten the MultiIndex and drop NaN values
    outliers = outliers.reset_index(level=0, drop=True).dropna()
    
    # Return both the outlier indices and the outliers DataFrame
    return outliers.index, df.loc[outliers.index]


# Define a function to calculate the mode, handling cases with multiple modes
def fill_mode(series):
    if not series.empty and series.mode().size > 0:
        return series.mode()[0]
    else:
        return series
    






def visualize_outliers(df):
    # df_outlier=df.dropna()
    numeric_columns=df.select_dtypes(["number"]).columns
    for i in numeric_columns:
        plt.figure(figsize=(10,6))
        sns.boxplot(x="race_eth", y=i,data=df)
        plt.xticks(rotation=45)  
        plt.show()


def evaluate_models(data, target_column, test_size=0.2):
    # 1. Separate features and target
    X = data.drop(columns=[target_column])
    y = data[target_column]
    
    # 2. Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
    
    # 3. Preprocessing pipelines for numeric and categorical features
    numeric_features = X.select_dtypes(include=['int64', 'float64']).columns
    categorical_features = X.select_dtypes(include=['object', 'category']).columns
    
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),  # Handle NaN by imputing the mean
        ('scaler', StandardScaler())
    ])
    
    categorical_transformer = Pipeline(steps=[('encoder', OneHotEncoder(drop='first'))])
    
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_transformer, numeric_features),
            ('cat', categorical_transformer, categorical_features)])
    
    # 4. Define models
    models = {
        "Linear Regression": LinearRegression(),
        "KNN Regressor": KNeighborsRegressor(), 
        "SVR": SVR(),
        "Decision Tree": DecisionTreeRegressor(),
        "Random Forest": RandomForestRegressor(),
        "XGBoost Regressor": XGBRegressor()
    }
    
    best_score = -np.inf
    best_pipeline = None
    best_model_name = ""
    
    # 5. Evaluate each model
    for name, model in models.items():
        print(f'Using model: {name}')
        
        # Create pipeline with preprocessor and model
        pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                   ('model', model)])
        
        # Train the model
        pipeline.fit(X_train, y_train)
        
        # Evaluate the model
        y_pred = pipeline.predict(X_test)
        test_score = r2_score(y_test, y_pred)
        
        # Print evaluation metrics
        print(f'Test R2 Score: {test_score:.4f}')
        print(f'RMSE: {np.sqrt(mean_squared_error(y_test, y_pred)):.4f}')
        print('-'*30)
        
        # Keep track of the best model
        if test_score > best_score:
            best_score = test_score
            best_pipeline = pipeline
            best_model_name = name
    
    print(f'Best model: {best_model_name} with R2 Score: {best_score:.4f}')
    return best_pipeline



In [70]:
df = nan_replace(9, df, ["breast_cancer_history", "biophx", "bmi_group", "menopaus", 
                         "current_hrt", "BIRADS_breast_density", "age_first_birth", 
                         "age_menarche", "first_degree_hx", "race_eth"])


A value is trying to be set on a copy of a DataFrame or Series through chained assignment using an inplace method.
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.





A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



## EDA

In [71]:

column_info(df)


column "year"'s info
------------------------------
the num of nan values: 0
the unique values: [2013]
the type is: int64
column "age_group_5_years"'s info
------------------------------
the num of nan values: 0
the unique values: [7]
the type is: int64
column "race_eth"'s info
------------------------------
the num of nan values: 0
the unique values: [1 2 3]
the type is: int64
column "first_degree_hx"'s info
------------------------------
the num of nan values: 585
the unique values: [ 0.  1. nan]
the type is: float64
column "age_menarche"'s info
------------------------------
the num of nan values: 4736
the unique values: [nan  0.  1.  2.]
the type is: float64
column "age_first_birth"'s info
------------------------------
the num of nan values: 1831
the unique values: [ 3.  4. nan  0.  1.  2.]
the type is: float64
column "BIRADS_breast_density"'s info
------------------------------
the num of nan values: 737
the unique values: [ 1.  2.  3.  4. nan]
the type is: float64
column "curren

### outlier visulization

In [72]:

# visualize_outliers(df)

### nan value visulization 

In [73]:
df_nan_columns=nan_columns(df)
nan_columns_bar(df_nan_columns)

In [74]:



df=data_cleaning2(df,"race_eth")


Data cleaning (method 2) completed in 2.4035 seconds.


In [75]:
df

Unnamed: 0,year,age_group_5_years,race_eth,first_degree_hx,age_menarche,age_first_birth,BIRADS_breast_density,current_hrt,menopaus,bmi_group,biophx,breast_cancer_history,count
0,2013,7,1,0.0,1.0,3.0,1.0,1.0,2.0,3.0,0.0,0.0,7
1,2013,7,1,0.0,1.0,3.0,1.0,1.0,2.0,3.0,1.0,0.0,3
2,2013,7,1,0.0,1.0,3.0,1.0,1.0,2.0,4.0,0.0,0.0,6
3,2013,7,1,0.0,1.0,3.0,1.0,1.0,2.0,4.0,1.0,0.0,1
4,2013,7,1,0.0,1.0,3.0,1.0,1.0,2.0,4.0,1.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9898,2013,7,3,0.0,1.0,4.0,3.0,0.0,2.0,2.0,0.0,0.0,5
9899,2013,7,3,0.0,1.0,4.0,3.0,0.0,2.0,4.0,1.0,1.0,1
9900,2013,7,3,0.0,1.0,4.0,3.0,0.0,2.0,1.0,0.0,0.0,7
9901,2013,7,3,0.0,1.0,4.0,3.0,0.0,2.0,1.0,1.0,0.0,2


In [76]:
df_nan_columns=nan_columns(df)
nan_columns_bar(df_nan_columns)

In [77]:
df

Unnamed: 0,year,age_group_5_years,race_eth,first_degree_hx,age_menarche,age_first_birth,BIRADS_breast_density,current_hrt,menopaus,bmi_group,biophx,breast_cancer_history,count
0,2013,7,1,0.0,1.0,3.0,1.0,1.0,2.0,3.0,0.0,0.0,7
1,2013,7,1,0.0,1.0,3.0,1.0,1.0,2.0,3.0,1.0,0.0,3
2,2013,7,1,0.0,1.0,3.0,1.0,1.0,2.0,4.0,0.0,0.0,6
3,2013,7,1,0.0,1.0,3.0,1.0,1.0,2.0,4.0,1.0,0.0,1
4,2013,7,1,0.0,1.0,3.0,1.0,1.0,2.0,4.0,1.0,1.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
9898,2013,7,3,0.0,1.0,4.0,3.0,0.0,2.0,2.0,0.0,0.0,5
9899,2013,7,3,0.0,1.0,4.0,3.0,0.0,2.0,4.0,1.0,1.0,1
9900,2013,7,3,0.0,1.0,4.0,3.0,0.0,2.0,1.0,0.0,0.0,7
9901,2013,7,3,0.0,1.0,4.0,3.0,0.0,2.0,1.0,1.0,0.0,2


In [78]:
df=df.drop(columns=['count','year'])

In [79]:
df

Unnamed: 0,age_group_5_years,race_eth,first_degree_hx,age_menarche,age_first_birth,BIRADS_breast_density,current_hrt,menopaus,bmi_group,biophx,breast_cancer_history
0,7,1,0.0,1.0,3.0,1.0,1.0,2.0,3.0,0.0,0.0
1,7,1,0.0,1.0,3.0,1.0,1.0,2.0,3.0,1.0,0.0
2,7,1,0.0,1.0,3.0,1.0,1.0,2.0,4.0,0.0,0.0
3,7,1,0.0,1.0,3.0,1.0,1.0,2.0,4.0,1.0,0.0
4,7,1,0.0,1.0,3.0,1.0,1.0,2.0,4.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
9898,7,3,0.0,1.0,4.0,3.0,0.0,2.0,2.0,0.0,0.0
9899,7,3,0.0,1.0,4.0,3.0,0.0,2.0,4.0,1.0,1.0
9900,7,3,0.0,1.0,4.0,3.0,0.0,2.0,1.0,0.0,0.0
9901,7,3,0.0,1.0,4.0,3.0,0.0,2.0,1.0,1.0,0.0


In [80]:
df['breast_cancer_history']=df['breast_cancer_history'].astype('int')

In [81]:
X = df[df.columns[:-1]].values
y = df["breast_cancer_history"].values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=123)
ros = RandomOverSampler()
X_train,y_train= ros.fit_resample(X_train, y_train)



In [82]:
models = {
    'k-NN': KNeighborsClassifier(),
    'Naive Bayes': GaussianNB(),
    'AdaBoost': AdaBoostClassifier(random_state=123),
    'Gradient Boosting': GradientBoostingClassifier(random_state=123),
    'LDA': LinearDiscriminantAnalysis(),
    'Decision Tree': DecisionTreeClassifier(random_state=123),
    'Random Forest': RandomForestClassifier(random_state=123),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    'CatBoost':CatBoostClassifier(iterations=100, learning_rate=0.1, depth=6)
}

In [83]:


# Initialize the results dictionary
model_results = {}

# Dictionary to store the results
results = {
    'Model': [],
    'CV_Score (Recall)': [],
    'Accuracy': [],
    'Precision': [],
    'F1 Score': []
}

# Define the recall scorer for cross-validation
recall_scorer = make_scorer(recall_score)

# Assuming you have defined your models (e.g., models = {'Model1': model1, 'Model2': model2, ...})
for name, model in models.items():
    # Train the model
    model.fit(X_train, y_train)
    
    # Predict on the train set
    y_pred = model.predict(X_train)
    
    # Perform cross-validation using recall as the scoring metric
    cv_score = np.mean(cross_val_score(model, X_train, y_train, cv=5, scoring=recall_scorer))
    
    # Append results to the dictionary
    results['Model'].append(name)
    results['CV_Score (Recall)'].append(cv_score)
    results['Accuracy'].append(accuracy_score(y_train, y_pred))
    results['Precision'].append(precision_score(y_train, y_pred))
    results['F1 Score'].append(f1_score(y_train, y_pred))
    
    # Store the classification report for each model
    model_results[name] = classification_report(y_train, y_pred)

results_df = pd.DataFrame(results)















Parameters: { "use_label_encoder" } are not used.



Parameters: { "use_label_encoder" } are not used.



Parameters: { "use_label_encoder" } are not used.



Parameters: { "use_label_encoder" } are not used.



Parameters: { "use_label_encoder" } are not used.



Parameters: { "use_label_encoder" } are not used.




0:	learn: 0.6704624	total: 208ms	remaining: 20.6s
1:	learn: 0.6520529	total: 213ms	remaining: 10.5s
2:	learn: 0.6360098	total: 218ms	remaining: 7.06s
3:	learn: 0.6230970	total: 223ms	remaining: 5.34s
4:	learn: 0.6118905	total: 227ms	remaining: 4.32s
5:	learn: 0.6023401	total: 233ms	remaining: 3.64s
6:	learn: 0.5941659	total: 238ms	remaining: 3.16s
7:	learn: 0.5872789	total: 243ms	remaining: 2.79s
8:	learn: 0.5814903	total: 247ms	remaining: 2.5s
9:	learn: 0.5768862	total: 252ms	remaining: 2.27s
10:	learn: 0.5726298	total: 257ms	remaining: 2.08s
11:	learn: 0.5686690	total: 262ms	remaining: 1.92s
12:	learn: 0.5656944	total: 267ms	remaining: 1.78s
13:	learn: 0.5629739	total: 272ms	remaining: 1.67s
14:	learn: 0.5605073	total: 276ms	remaining: 1.57s
15:	learn: 0.5582530	total: 281ms	remaining: 1.48s
16:	learn: 0.5563641	total: 286ms	remaining: 1.4s
17:	learn: 0.5546559	total: 291ms	remaining: 1.32s
18:	learn: 0.5530025	total: 297ms	remaining: 1.27s
19:	learn: 0.5516376	total: 303ms	remaining

In [84]:
results_df = pd.DataFrame(results)

results_df.style.background_gradient(cmap='bwr', axis=0)

Unnamed: 0,Model,CV_Score (Recall),Accuracy,Precision,F1 Score
0,k-NN,0.514273,0.696426,0.845619,0.61287
1,Naive Bayes,0.76915,0.736195,0.721589,0.744611
2,AdaBoost,0.783066,0.739124,0.719688,0.750175
3,Gradient Boosting,0.785117,0.747107,0.729524,0.756436
4,LDA,0.797567,0.737659,0.711732,0.752795
5,Decision Tree,0.897902,0.844368,0.79767,0.855688
6,Random Forest,0.913282,0.844368,0.790821,0.857488
7,XGBoost,0.857184,0.81705,0.782609,0.827558
8,CatBoost,0.801083,0.763073,0.743955,0.772006
