<a href="https://colab.research.google.com/github/thoriqnaja/project-UAS/blob/main/Proyek_akhir_UAS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import warnings
import os

import numpy as np  # NumPy for numerical computing
import pandas as pd  # Pandas for data manipulation and analysis

# Data visualization libraries
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore', Warning)

# Pandas display settings: show all columns, use a wide console, and
# render floats with three decimals.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 500)
pd.set_option('display.float_format', '{:.3f}'.format)

In [None]:
# Location of the heart-disease CSV (a Kaggle input path).
DATA_PATH = "/kaggle/input/d/mexwell/heart-disease-dataset/heart_statlog_cleveland_hungary_final.csv"
df = pd.read_csv(DATA_PATH)

In [None]:
def check_df(dataframe):
    """Print a structural overview of ``dataframe``.

    The report consists of six titled sections, in this order: shape,
    column names, first five rows, last five rows, ``DataFrame.info()``
    output and column dtypes. Nothing is returned.
    """
    # Each entry pairs a section banner with the callable that emits its body.
    report = [
        ("##################### Row and Column Count #####################",
         lambda: print(dataframe.shape)),
        ("\n##################### Column Names #####################",
         lambda: print(dataframe.columns)),
        ("\n##################### First Five Rows #####################",
         lambda: print(dataframe.head())),
        ("\n##################### Last Five Rows #####################",
         lambda: print(dataframe.tail())),
        # info() writes its own output, so it is called directly.
        ("\n##################### DataFrame Information #####################",
         dataframe.info),
        ("\n##################### Data Types #####################",
         lambda: print(dataframe.dtypes)),
    ]
    for banner, emit in report:
        print(banner)
        emit()

# Print the structural overview (shape, columns, head/tail, info, dtypes) of the loaded dataset.
check_df(df)

In [None]:
# Summary statistics with one row per column (describe() output transposed).
summary_by_column = df.describe().transpose()
print(summary_by_column)

In [None]:
def plot_numerical_col(dataframe, numerical_col):
    """Show a 20-bin histogram of ``dataframe[numerical_col]``.

    The x-axis is labelled with the column name and the figure is
    displayed with a blocking ``plt.show``.
    """
    axes = dataframe[numerical_col].hist(bins=20)
    axes.set_xlabel(numerical_col)
    plt.show(block=True)

# Every column of the dataset, reused by the cells below.
cols = list(df.columns)

# Draw one histogram per column.
for col in cols:
    plot_numerical_col(df, col)

In [None]:
# Share of each distinct value, as a percentage of all rows.
total_rows = len(df)
for col in cols:
    print(f"\n{col} Value Counts (%):")
    percentages = 100 * df[col].value_counts() / total_rows
    print(percentages)

In [None]:
# Number of missing values in each column.
missing_per_column = df.isna().sum()
print(missing_per_column)

In [None]:
def outlier_thresholds(dataframe, col_name, q1=0.05, q3=0.95):
    """Return the ``(low_limit, up_limit)`` outlier bounds for a column.

    The bounds follow the 1.5 x range rule, applied to the quantiles
    ``q1`` and ``q3`` of ``dataframe[col_name]``. Note that the defaults
    are the 5th and 95th percentiles, not the classic quartiles, so the
    resulting band is wider than the usual IQR fence.

    Args:
        dataframe: DataFrame holding the column.
        col_name: Label of the column to analyse.
        q1: Lower quantile, in [0, 1].
        q3: Upper quantile, in [0, 1].

    Returns:
        Tuple ``(low_limit, up_limit)``.
    """
    values = dataframe[col_name]
    lower_quantile = values.quantile(q1)
    upper_quantile = values.quantile(q3)
    margin = 1.5 * (upper_quantile - lower_quantile)
    return lower_quantile - margin, upper_quantile + margin

def check_outlier(dataframe, col_name):
    """Return True if ``col_name`` has a value outside its outlier bounds.

    The bounds come from ``outlier_thresholds(dataframe, col_name)``.

    Args:
        dataframe: DataFrame holding the column.
        col_name: Label of the column to test.

    Returns:
        ``True`` when at least one value is above the upper limit or
        below the lower limit, ``False`` otherwise.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, col_name)
    column = dataframe[col_name]
    outlier_mask = (column > up_limit) | (column < low_limit)
    # Ask the mask whether any row was flagged. Calling .any() on the
    # filtered rows instead tests the truthiness of their *values*, so an
    # outlier row made up entirely of zeros would be reported as "no outlier".
    return bool(outlier_mask.any())

# Report, column by column, whether any value falls outside the thresholds.
for col in cols:
    has_outlier = check_outlier(df, col)
    print(col, has_outlier)

In [None]:
def replace_with_thresholds(dataframe, variable):
    """Cap the values of ``variable`` at its outlier bounds, in place.

    Values below the lower limit from ``outlier_thresholds`` are raised to
    that limit and values above the upper limit are lowered to it. The
    column of ``dataframe`` is overwritten; nothing is returned.

    Args:
        dataframe: DataFrame to modify.
        variable: Label of the column to cap.
    """
    low_limit, up_limit = outlier_thresholds(dataframe, variable)
    # clip() applies both bounds in one pass and lets pandas pick a dtype
    # that can hold them. Writing a fractional limit into an integer
    # column with .loc is an incompatible-dtype assignment, which recent
    # pandas versions deprecate.
    dataframe[variable] = dataframe[variable].clip(lower=low_limit, upper=up_limit)

# Cap outliers in the "resting bp s" column only; no other column is modified here.
replace_with_thresholds(df, "resting bp s")

In [None]:
# Re-run the outlier check on every column after capping "resting bp s".
for col in cols:
    still_has_outlier = check_outlier(df, col)
    print(col, still_has_outlier)

In [None]:
# numeric_only=True keeps mean/std/median working if the frame ever holds
# a non-numeric column (pandas >= 2.0 raises instead of skipping it).

# Mean
print("\n##################### Mean #####################")
print(df.mean(numeric_only=True))

# Standard deviation
print("\n##################### Standard Deviation #####################")
print(df.std(numeric_only=True))

# Median
print("\n##################### Median #####################")
print(df.median(numeric_only=True))

# Mode (first row of mode(), i.e. the first modal value of each column)
print("\n##################### Mode #####################")
print(df.mode().iloc[0])

# Summary statistics for all columns, whatever their dtype
print("\n##################### Summary Statistics #####################")
print(df.describe(include='all'))

In [None]:
# Per-column statistics used to annotate the distribution plots.
# numeric_only=True matches the numeric-dtype filter used by the loop below.
mean_values = df.mean(numeric_only=True)
median_values = df.median(numeric_only=True)
mode_values = df.mode().iloc[0]
std_values = df.std(numeric_only=True)

for column in df.select_dtypes(include=['float64', 'int64']).columns:
    col_mean = mean_values[column]
    col_std = std_values[column]

    plt.figure(figsize=(10, 4))
    sns.histplot(df[column], kde=True, color='blue')
    plt.axvline(col_mean, color='red', linestyle='--', label=f'Mean: {col_mean:.2f}')
    plt.axvline(median_values[column], color='green', linestyle='-', label=f'Median: {median_values[column]:.2f}')
    plt.axvline(mode_values[column], color='orange', linestyle='-', label=f'Mode: {mode_values[column]:.2f}')
    # The standard deviation is a spread, not a position on the x-axis, so
    # mark it as a band around the mean instead of a line at x = std.
    plt.axvline(col_mean - col_std, color='purple', linestyle=':', label=f'Mean +/- Std (Std: {col_std:.2f})')
    plt.axvline(col_mean + col_std, color='purple', linestyle=':')
    plt.title(f'Distribution of {column}')
    plt.legend()
    plt.show()

In [None]:
# One scatter plot per numeric column: row index on x, column value on y.
numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
for column in numeric_columns:
    plt.figure(figsize=(10, 4))
    ax = sns.scatterplot(x=df.index, y=df[column], alpha=0.5, color='blue')
    ax.set_title(f'Scatter Plot of {column}')
    ax.set_xlabel('Index')
    ax.set_ylabel(column)
    plt.show()

In [None]:
# Histogram with a KDE overlay for each numeric column.
numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
for column in numeric_columns:
    plt.figure(figsize=(10, 6))
    ax = sns.histplot(df[column], kde=True)
    ax.set_title(f'Histogram of {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Frequency')
    plt.show()

In [None]:
# Horizontal box plot for each numeric column.
numeric_columns = df.select_dtypes(include=['float64', 'int64']).columns
for column in numeric_columns:
    plt.figure(figsize=(10, 6))
    ax = sns.boxplot(x=df[column])
    ax.set_title(f'Box Plot of {column}')
    ax.set_xlabel(column)
    plt.show()

In [None]:
# Pairwise correlation between all columns; reused by the feature-selection cell.
corr_matrix = df.corr()

# Draw the matrix as an annotated heatmap.
plt.figure(figsize=(10, 8))
heatmap_axes = sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')
heatmap_axes.set_title('Correlation Matrix')
plt.show()

In [None]:
def target_summary_with_num(dataframe, target, numerical_col):
    """Print the mean of ``numerical_col`` for each value of ``target``.

    Args:
        dataframe: DataFrame holding both columns.
        target: Label of the column to group by.
        numerical_col: Label of the column whose mean is reported.

    The table is followed by three newlines; nothing is returned.
    """
    grouped_means = dataframe.groupby(target).agg({numerical_col: "mean"})
    print(grouped_means, end="\n\n\n")


# Mean of every feature per target class. The target column itself is
# skipped: grouping it by itself only reproduces the class labels.
for col in cols:
    if col == "target":
        continue
    target_summary_with_num(df, "target", col)

In [None]:
# Keep the columns whose absolute correlation with the target exceeds 0.4.
target_corr = corr_matrix['target']
strong_mask = target_corr.abs() > 0.4
high_corr_features = target_corr[strong_mask].index.tolist()

# The target is not a feature of itself, so remove it.
high_corr_features.remove('target')
print(f"Selected features: {high_corr_features}")

In [None]:
from sklearn.preprocessing import RobustScaler

# Scale each feature column independently with a fresh RobustScaler.
# The target column is left untouched: it holds the class label, and
# rescaling it would change the label values used by later cells.
for col in cols:
    if col == "target":
        continue
    # fit_transform returns an (n_rows, 1) array; flatten it to 1-D
    # before assigning it back as a column.
    df[col] = RobustScaler().fit_transform(df[[col]]).ravel()

In [None]:
# Display the first 15 rows of df as the cell output (this cell follows the scaling cell).
df.head(15)

In [None]:
target_variable = 'target'  # our target variable

# Work on a real copy; a plain `df2 = df` would only alias the same object.
df2 = df.copy()
corr_matrix2 = df2.corr()  # correlation matrix

# Select the variables whose absolute correlation with the target is greater than 0.4.
target_corr2 = corr_matrix2[target_variable]
high_corr_features = target_corr2[target_corr2.abs() > 0.4].index.tolist()

# Remove the target variable itself.
high_corr_features.remove(target_variable)

# Check
print(f"Selected features: {high_corr_features}")

# Filter the dataset to keep only the high-correlation features plus the target.
df2 = df2[high_corr_features + [target_variable]]

In [None]:
# Histogram with a KDE overlay for each numeric column of the reduced frame.
selected_numeric_columns = df2.select_dtypes(include=['float64', 'int64']).columns
for column in selected_numeric_columns:
    plt.figure(figsize=(10, 6))
    ax = sns.histplot(df2[column], kde=True)
    ax.set_title(f'Histogram of {column}')
    ax.set_xlabel(column)
    ax.set_ylabel('Frequency')
    plt.show()

# New section