In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from keras.models import Sequential
from keras.layers import Dense
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import glob
import os
import re
warnings.filterwarnings('ignore')

2024-08-10 07:31:37.575233: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-08-10 07:31:37.575353: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-08-10 07:31:37.738132: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# **Data Loading**

In [None]:
# Load a reproducible random sample of every training CSV and tag each row with
# the attack category taken from its file name.
folder_path = '/kaggle/input/attacks/csv/train'
# glob returns paths in arbitrary order; sorting keeps both the row order and the
# rows drawn by the seeded sampler below identical across runs.
csv_files = sorted(glob.glob(os.path.join(folder_path, '*.csv')))
if not csv_files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(f"No CSV files found in {folder_path}")
dataframes = []
sampling_fraction = 300_000 / 7_000_000  # same value as before, digits regrouped for readability
# Dedicated seeded generator so Restart & Run All samples the same rows every time.
sampling_rng = np.random.default_rng(42)
for file in csv_files:
    # Row 0 (the header) is always kept; every other row is kept with
    # probability `sampling_fraction`.
    df = pd.read_csv(file, skiprows=lambda i: i > 0 and sampling_rng.random() > sampling_fraction)
    attack_category = os.path.basename(file)[:-15]  # drop the fixed 15-character file-name tail
    # Some categories are split over several numbered files (presumably part
    # numbers); strip any trailing digits so all parts share one label. This
    # replaces the hand-maintained allow-list plus one-character strip, which
    # would truncate any unsuffixed name missing from the list.
    attack_category = re.sub(r'\d+$', '', attack_category)
    df['Attack_Category'] = attack_category
    dataframes.append(df)
merged_data = pd.concat(dataframes, ignore_index=True)

In [None]:
# Report the size of the sampled training frame and the row count per attack category.
train_class_counts = merged_data['Attack_Category'].value_counts()
print("Shape of the combined DataFrame (train):", merged_data.shape)
print(train_class_counts)

In [None]:
# Load a reproducible random sample of every test CSV and tag each row with the
# attack category taken from its file name.
folder_path = '/kaggle/input/attacks/csv/test'
# Sorted so that the seeded sampler below sees the files in a fixed order.
csv_files = sorted(glob.glob(os.path.join(folder_path, '*.csv')))
if not csv_files:
    # Fail with a clear message instead of pd.concat's "No objects to concatenate".
    raise FileNotFoundError(f"No CSV files found in {folder_path}")
dataframes = []
sampling_fraction = 200_000 / 1_600_000
# Dedicated seeded generator so the sampled test rows are identical on every run.
sampling_rng = np.random.default_rng(42)
for file in csv_files:
    # Row 0 (the header) is always kept; every other row is kept with
    # probability `sampling_fraction`.
    df = pd.read_csv(file, skiprows=lambda i: i > 0 and sampling_rng.random() > sampling_fraction)
    attack_category = os.path.basename(file)[:-14]  # drop the fixed 14-character file-name tail
    # Numbered parts of one attack (e.g. "TCP_IP-DDoS-UDP1" / "TCP_IP-DDoS-UDP2")
    # must collapse to a single label. Stripping trailing digits covers every
    # name the previous hard-coded list held (it had a duplicated entry) and
    # any numbered file that list missed.
    attack_category = re.sub(r'\d+$', '', attack_category)
    df['Attack_Category'] = attack_category
    dataframes.append(df)

test_data = pd.concat(dataframes, ignore_index=True)
print("Shape of the combined DataFrame:", test_data.shape)
print(test_data['Attack_Category'].value_counts())

In [None]:
# Bar chart: number of sampled training rows per attack category.
plt.figure(figsize=(12, 8))
ax = sns.countplot(data=merged_data, x='Attack_Category', palette='viridis')
ax.set_title('Distribution of Different Types of Cyber Attacks', fontsize=16)
ax.set_xlabel('Attack Category', fontsize=14)
ax.set_ylabel('Frequency', fontsize=14)
# Category names are long; tilt them so they do not overlap.
plt.xticks(rotation=45)
plt.tight_layout()

In [None]:
# Bubble chart of Duration vs Rate: colour encodes 'Drate', marker area encodes 'Weight'.
weight = merged_data['Weight']
# Rescale 'Weight' so the largest value maps to a marker size of 1000.
bubble_size = weight / weight.max() * 1000

plt.figure(figsize=(12, 8))
scatter = plt.scatter(
    x='Duration',
    y='Rate',
    c='Drate',
    s=bubble_size,
    cmap='spring',
    alpha=0.5,
    data=merged_data,
)
# Size legend built from the scatter's own size scale.
size_handles, size_labels = scatter.legend_elements("sizes", num=6)
plt.legend(size_handles, size_labels, title='Weight')
plt.colorbar(scatter)
plt.title('Duration vs Rate Colored by Drate with Weight Indication', fontsize=16)
plt.xlabel('Duration', fontsize=14)
plt.ylabel('Rate', fontsize=14)
plt.show()

# **EDA**

In [None]:
# Display summary statistics of the sampled training frame.
merged_data.describe()

In [None]:
# Print the column overview (names, dtypes, non-null counts) of the sampled training frame.
merged_data.info()

In [None]:
# Plot a fixed-seed random subset so the scatter stays readable and quick to draw.
# The request is capped at the number of available rows: DataFrame.sample raises
# ValueError when n exceeds the frame length (sampling without replacement).
sample_data = merged_data.sample(n=min(10000, len(merged_data)), random_state=1)  # Random state for reproducibility
plt.figure(figsize=(10, 6))
sns.scatterplot(data=sample_data, x='Duration', y='Rate', color='red')
plt.title('Scatter Plot of Duration vs Rate')
plt.xlabel('Duration')
plt.ylabel('Rate')
plt.show()


In [None]:
# Box plot comparing the distribution of 'Duration' across attack categories.
fig, ax = plt.subplots(figsize=(12, 8))
sns.boxplot(data=merged_data, x='Attack_Category', y='Duration', ax=ax)
# Category names are long; tilt them so they do not overlap.
plt.xticks(rotation=45)
ax.set_title('Box Plot of Duration by Attack Category')
ax.set_xlabel('Attack Category')
ax.set_ylabel('Duration')
plt.show()

In [None]:
# Pairwise relationships between the four rate/duration features on the sampled rows,
# with kernel-density estimates on the diagonal.
pairplot_columns = ['Duration', 'Rate', 'Srate', 'Drate']
sns.pairplot(sample_data[pairplot_columns], diag_kind='kde')
plt.show()

# **Data Pre-Processing**

In [None]:
# --- Training data: report duplicates/nulls, drop duplicate rows, then confirm ---
train_duplicate_count = merged_data.duplicated().sum()
train_null_count = merged_data.isnull().sum().sum()
print(f"Total Duplicate Values are : {train_duplicate_count}")
print(f"Total Null Values are : {train_null_count}")
merged_data = merged_data.drop_duplicates()
print(f"Total Duplicate Values are : {merged_data.duplicated().sum()}")

# --- Test data: same report and clean-up ---
test_duplicate_count = test_data.duplicated().sum()
test_null_count = test_data.isnull().sum().sum()
print(f"Total Duplicate Values in Testing Data are : {test_duplicate_count}")
print(f"Total Null Values in Testing Data are : {test_null_count}")
test_data = test_data.drop_duplicates()
print(f"Total Duplicate Values are : {test_data.duplicated().sum()}")

In [None]:
def detect_outliers(data, feature):
    """Return the rows of ``data`` whose ``feature`` value is an outlier by the 1.5 x IQR rule.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the column to inspect.
    feature : str
        Name of the numeric column to test.

    Returns
    -------
    pandas.DataFrame
        The subset of ``data`` (original index preserved) where the value lies
        strictly below ``Q1 - 1.5 * IQR`` or strictly above ``Q3 + 1.5 * IQR``.
    """
    column = data[feature]
    first_quartile = column.quantile(0.25)
    third_quartile = column.quantile(0.75)
    spread = third_quartile - first_quartile
    too_low = column < first_quartile - 1.5 * spread
    too_high = column > third_quartile + 1.5 * spread
    # Comparisons against NaN are False, so missing values are never flagged.
    return data[too_low | too_high]

# Flag rows whose 'Duration' is an outlier and drop them from the training frame only.
outliers_duration = detect_outliers(merged_data, 'Duration')
print(f"Number of outliers in Duration: {outliers_duration.shape[0]}")
print("Shape of data before removing outliers:", merged_data.shape)
# Keep every row whose index label is not among the flagged outliers.
is_outlier_row = merged_data.index.isin(outliers_duration.index)
merged_data = merged_data[~is_outlier_row]
print("Shape of data after removing outliers:", merged_data.shape)

# **Encoding**

In [None]:
# Separate the label column from the feature columns for both splits.
y_train = merged_data['Attack_Category']
X_train = merged_data.drop(columns='Attack_Category')
y_test = test_data['Attack_Category']
X_test = test_data.drop(columns='Attack_Category')

In [None]:
# Encode the string labels as integers; the mapping is learned on the training labels
# and then re-used for the test labels.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

# Names of all non-label columns (kept for reference; not used by the scaling below).
features_to_scale = merged_data.columns.difference(['Attack_Category'])

# Standardise the features: statistics are fitted on the training set only and then
# applied unchanged to the test set.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# **Correlation Analysis**

In [None]:
# Pairwise correlation between all numeric columns of the training frame.
numeric_columns = merged_data.select_dtypes(include=[np.number])
correlation_matrix = numeric_columns.corr()
correlation_matrix

In [None]:
# Keep only the strictly-upper triangle of the correlation matrix so each feature
# pair appears once and the diagonal (self-correlation) is excluded.
upper_mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool), k=1)
upper_triangle = correlation_matrix.where(upper_mask)

# The 10 feature pairs with the largest absolute correlation.
top_correlated_features = upper_triangle.unstack().abs().sort_values(ascending=False).dropna().head(10)

# Collect the features taking part in those pairs. dict.fromkeys de-duplicates
# while preserving first-seen order (strongest pair first); a plain set of strings
# would order the heatmap axes differently on every kernel restart because string
# hashing is randomised per process.
unique_features = list(dict.fromkeys(
    feature for pair in top_correlated_features.index for feature in pair
))

# Plot the heatmap of the selected features
plt.figure(figsize=(10, 6))  # Adjust the size accordingly
sns.heatmap(correlation_matrix.loc[unique_features, unique_features], annot=True, cmap='coolwarm', fmt='.2f', linewidths=.5)
plt.xticks(rotation=45, ha='right')
plt.yticks(rotation=0)
plt.title('Top 10 Correlated Features in Dataset')
plt.show()

# **Modelling**

## **Logistic Regression**

In [None]:
# Fit a logistic-regression baseline on the scaled training data and score it on the test set.
logistic_model = LogisticRegression(max_iter=100)
logistic_model.fit(X_train, y_train)
y_pred_logistic = logistic_model.predict(X_test)
acc_logistic = accuracy_score(y_test, y_pred_logistic)
print("Logistic Regression Accuracy:", acc_logistic)
# Pass the original category names, as the other model cells do, so the report
# rows read as attack names rather than encoded integers.
print(classification_report(y_test, y_pred_logistic, target_names=label_encoder.classes_))

In [None]:
# NOTE: the scores below are hard-coded illustrative placeholders, NOT results
# measured in this notebook (three of the four models have not even been trained
# at this point). The comparison built from real classification reports is
# produced further down, after the ANN has been evaluated. The plot title says
# so explicitly so the figure cannot be mistaken for a result.
metrics_logistic = {'Accuracy': 0.85, 'Precision': 0.84, 'Recall': 0.83, 'F1-Score': 0.84}
metrics_decision_tree = {'Accuracy': 0.80, 'Precision': 0.79, 'Recall': 0.78, 'F1-Score': 0.79}
metrics_random_forest = {'Accuracy': 0.88, 'Precision': 0.89, 'Recall': 0.87, 'F1-Score': 0.88}
metrics_ann = {'Accuracy': 0.90, 'Precision': 0.91, 'Recall': 0.89, 'F1-Score': 0.90}

# Build one column per metric, one row per model (pandas, seaborn and matplotlib
# are already imported in the first cell, so they are not re-imported here).
placeholder_metrics = {
    'Logistic Regression': metrics_logistic,
    'Decision Tree': metrics_decision_tree,
    'Random Forest': metrics_random_forest,
    'ANN': metrics_ann,
}
data = {'Model': list(placeholder_metrics)}
for metric_name in ('Accuracy', 'Precision', 'Recall', 'F1-Score'):
    data[metric_name] = [scores[metric_name] for scores in placeholder_metrics.values()]
df = pd.DataFrame(data)
# Long format (Model, Metric, Value) is what the grouped bar plot expects.
df_melted = df.melt(id_vars=["Model"], var_name="Metric", value_name="Value")

# Set the aesthetic style of the plots
sns.set(style="whitegrid")

# Initialize the matplotlib figure
plt.figure(figsize=(12, 8))

# Create a bar plot
ax = sns.barplot(x='Metric', y='Value', hue='Model', data=df_melted, palette='viridis')

# Label each bar with its value. Iterating the bar containers (one per model)
# labels only real bars; looping over ax.patches can also pick up non-bar
# patches that some seaborn versions add, producing stray labels.
for container in ax.containers:
    ax.bar_label(container, fmt='%.2f', padding=3)

# Final touches
plt.title('Comparison of Machine Learning Models Across Multiple Metrics (illustrative placeholder values)', fontsize=16)
plt.xlabel('Metric', fontsize=14)
plt.ylabel('Value', fontsize=14)
plt.legend(title='Model')

# Show plot
plt.show()


## **Decision Tree**

In [None]:
# Fit a decision tree on the scaled training data and score it on the test set.
# random_state is fixed (matching the random-forest cell) because tree
# construction is randomised; without it the reported scores can change between runs.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, y_train)
y_pred_dt = dt_model.predict(X_test)
acc_dt = accuracy_score(y_test, y_pred_dt)
print("Decision Tree Accuracy:", acc_dt)
print(classification_report(y_test, y_pred_dt, target_names=label_encoder.classes_))

## **Random Forest**

In [None]:
# Fit a 100-tree random forest (fixed seed) and score it on the test set.
rf_model = RandomForestClassifier(random_state=42, n_estimators=100)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

acc_rf = accuracy_score(y_test, y_pred_rf)
print("Random Forest Accuracy:", acc_rf)
rf_report_text = classification_report(y_test, y_pred_rf, target_names=label_encoder.classes_)
print(rf_report_text)

## **ANN Architecture**

In [None]:
# Fully connected classifier: three ReLU hidden layers (128 -> 64 -> 32 units)
# followed by a softmax layer with one output per class present in y_train.
n_features = X_train.shape[1]
n_classes = len(np.unique(y_train))
model = Sequential([
    Dense(units=128, activation='relu', input_shape=(n_features,)),
    Dense(units=64, activation='relu'),
    Dense(units=32, activation='relu'),
    Dense(units=n_classes, activation='softmax'),
])
# Labels are integer-encoded, hence the sparse variant of categorical cross-entropy.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


In [None]:
# Keras carves the validation split off the *end* of the arrays it is given,
# before any shuffling. The training rows are still ordered file by file (one
# attack category after another), so an unshuffled split would validate on only
# the last categories and withhold them from training. Shuffle once with a
# fixed seed so the 20% hold-out is a random, reproducible slice.
shuffle_order = np.random.default_rng(42).permutation(X_train.shape[0])
model.fit(X_train[shuffle_order], y_train[shuffle_order], epochs=10, batch_size=128, validation_split=0.2)

In [None]:
# Predict class probabilities for the test set and take the most probable class per row.
y_pred_prob = model.predict(X_test)
y_pred = y_pred_prob.argmax(axis=1)

# Overall accuracy followed by the per-class report with readable category names.
acc_ann = accuracy_score(y_test, y_pred)
print("ANN Accuracy:", acc_ann)
ann_report_text = classification_report(y_test, y_pred, target_names=label_encoder.classes_)
print(ann_report_text)

In [None]:
# Dictionary-form classification reports for the four models evaluated above
# (requires the prediction arrays produced by their respective cells).
report_logistic, report_dt, report_rf, report_ann = (
    classification_report(y_test, predictions, output_dict=True)
    for predictions in (
        y_pred_logistic,  # logistic regression
        y_pred_dt,        # decision tree
        y_pred_rf,        # random forest
        y_pred,           # ANN model
    )
)
# Function to extract the average scores for the metrics
def extract_metrics(report):
    """Pull the headline scores out of a dict-form classification report.

    Parameters
    ----------
    report : dict
        Mapping with an ``'accuracy'`` entry and a ``'weighted avg'`` entry that
        itself holds ``'precision'``, ``'recall'`` and ``'f1-score'``.

    Returns
    -------
    dict
        ``{'Accuracy', 'Precision', 'Recall', 'F1-Score'}`` -> value, where the
        last three are the weighted averages.

    Raises
    ------
    KeyError
        If any of the expected entries is missing from ``report``.
    """
    summary = {'Accuracy': report['accuracy']}
    weighted = report['weighted avg']
    # Map the display name of each metric to its key in the report.
    for display_name, report_key in (
        ('Precision', 'precision'),
        ('Recall', 'recall'),
        ('F1-Score', 'f1-score'),
    ):
        summary[display_name] = weighted[report_key]
    return summary

# Headline scores (accuracy + weighted precision/recall/F1) of each evaluated model
metrics_logistic = extract_metrics(report_logistic)
metrics_dt = extract_metrics(report_dt)
metrics_rf = extract_metrics(report_rf)
metrics_ann = extract_metrics(report_ann)

# One row per model, one column per metric
measured_metrics = {
    'Logistic Regression': metrics_logistic,
    'Decision Tree': metrics_dt,
    'Random Forest': metrics_rf,
    'ANN': metrics_ann,
}
data = {'Model': list(measured_metrics)}
for metric_name in ('Accuracy', 'Precision', 'Recall', 'F1-Score'):
    data[metric_name] = [scores[metric_name] for scores in measured_metrics.values()]
df = pd.DataFrame(data)
# Long format (Model, Metric, Value) is what the grouped bar plot expects.
df_melted = df.melt(id_vars=["Model"], var_name="Metric", value_name="Value")
# Set the aesthetic style of the plots
sns.set(style="whitegrid")

# Initialize the matplotlib figure
plt.figure(figsize=(12, 8))

# Create a bar plot
ax = sns.barplot(x='Metric', y='Value', hue='Model', data=df_melted, palette='viridis')

# Label each bar with its value. Iterating the bar containers (one per model)
# labels only real bars; looping over ax.patches can also pick up non-bar
# patches that some seaborn versions add, producing stray labels.
for container in ax.containers:
    ax.bar_label(container, fmt='%.2f', padding=3)

# Final touches
plt.title('Comparison of Machine Learning Models Across Multiple Metrics', fontsize=16)
plt.xlabel('Metric', fontsize=14)
plt.ylabel('Value', fontsize=14)
plt.legend(title='Model', loc='center right')

# Show plot
plt.show()

In [None]:
# Rank the input features by the importance the fitted random forest assigns to them.
# Column names come from the training frame (minus the label), in the same order
# the columns were fed to the scaler and the model.
feature_names = merged_data.drop('Attack_Category', axis=1).columns
importances = rf_model.feature_importances_

# Feature/importance table, most important first
importance_df = pd.DataFrame(
    {'Feature': feature_names, 'Importance': importances}
).sort_values(by='Importance', ascending=False)

# Ten most important features
top_features = importance_df.head(10)

# Horizontal bar chart of the top 10
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=top_features, x='Importance', y='Feature', palette='viridis', ax=ax)
ax.set_title('Top 10 Feature Importances in Random Forest Model', fontsize=16)
ax.set_xlabel('Importance', fontsize=14)
ax.set_ylabel('Feature', fontsize=14)

# Show plot
plt.show()

In [None]:

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense
from tensorflow.keras.utils import to_categorical

# Convert labels to one-hot encoding
y_train_onehot = to_categorical(y_train)
y_test_onehot = to_categorical(y_test)

# Reshape input data for RNN (samples, time steps, features); each row becomes a
# sequence of length 1.
X_train_rnn = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test_rnn = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

# Build the RNN model
rnn_model = Sequential([
    SimpleRNN(64, input_shape=(1, X_train.shape[1]), activation='relu'),
    Dense(32, activation='relu'),
    Dense(y_train_onehot.shape[1], activation='softmax')
])

rnn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model.
# Keras takes the validation split from the *end* of the arrays, before any
# shuffling, and the training rows are still ordered file by file (one attack
# category after another). Shuffle once with a fixed seed so the 20% hold-out is
# a random, reproducible slice instead of just the last categories.
rnn_shuffle_order = np.random.default_rng(42).permutation(X_train_rnn.shape[0])
rnn_history = rnn_model.fit(
    X_train_rnn[rnn_shuffle_order],
    y_train_onehot[rnn_shuffle_order],
    epochs=5, batch_size=32, validation_split=0.2, verbose=1,
)

# Evaluate the model: most probable class per test row vs. the integer labels
y_pred_rnn = rnn_model.predict(X_test_rnn)
y_pred_rnn_classes = np.argmax(y_pred_rnn, axis=1)
acc_rnn = accuracy_score(y_test, y_pred_rnn_classes)
print("RNN Accuracy:", acc_rnn)
print(classification_report(y_test, y_pred_rnn_classes, target_names=label_encoder.classes_))

In [None]:
from tensorflow.keras.layers import LSTM

# Reshape input data for LSTM (samples, time steps, features); each row becomes a
# sequence of length 1.
X_train_lstm = X_train.reshape((X_train.shape[0], 1, X_train.shape[1]))
X_test_lstm = X_test.reshape((X_test.shape[0], 1, X_test.shape[1]))

# Build the LSTM model (uses the one-hot training labels created in the RNN cell)
lstm_model = Sequential([
    LSTM(64, input_shape=(1, X_train.shape[1]), activation='relu', return_sequences=True),
    LSTM(32, activation='relu'),
    Dense(16, activation='relu'),
    Dense(y_train_onehot.shape[1], activation='softmax')
])

lstm_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model.
# Keras takes the validation split from the *end* of the arrays, before any
# shuffling, and the training rows are still ordered file by file (one attack
# category after another). Shuffle once with a fixed seed so the 20% hold-out is
# a random, reproducible slice instead of just the last categories.
lstm_shuffle_order = np.random.default_rng(42).permutation(X_train_lstm.shape[0])
lstm_history = lstm_model.fit(
    X_train_lstm[lstm_shuffle_order],
    y_train_onehot[lstm_shuffle_order],
    epochs=5, batch_size=32, validation_split=0.2, verbose=1,
)

# Evaluate the model: most probable class per test row vs. the integer labels
y_pred_lstm = lstm_model.predict(X_test_lstm)
y_pred_lstm_classes = np.argmax(y_pred_lstm, axis=1)
acc_lstm = accuracy_score(y_test, y_pred_lstm_classes)
print("LSTM Accuracy:", acc_lstm)
print(classification_report(y_test, y_pred_lstm_classes, target_names=label_encoder.classes_))

In [None]:

from sklearn.metrics import classification_report
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Dictionary-form classification reports for the two recurrent models
# (requires the predictions produced by the RNN and LSTM cells).
report_rnn = classification_report(y_test, y_pred_rnn_classes, output_dict=True)
report_lstm = classification_report(y_test, y_pred_lstm_classes, output_dict=True)

# Headline scores per model. extract_metrics is defined once, in the earlier
# model-comparison cell; the identical second definition that used to live here
# was removed so the helper has a single source of truth.
metrics_rnn = extract_metrics(report_rnn)
metrics_lstm = extract_metrics(report_lstm)

# One row per model, one column per metric
data = {
    'Model': ['RNN', 'LSTM'],
    'Accuracy': [metrics_rnn['Accuracy'], metrics_lstm['Accuracy']],
    'Precision': [metrics_rnn['Precision'], metrics_lstm['Precision']],
    'Recall': [metrics_rnn['Recall'], metrics_lstm['Recall']],
    'F1-Score': [metrics_rnn['F1-Score'], metrics_lstm['F1-Score']]
}

df = pd.DataFrame(data)
# Long format (Model, Metric, Value) is what the grouped bar plot expects.
df_melted = df.melt(id_vars=["Model"], var_name="Metric", value_name="Value")

# Set the aesthetic style of the plots
sns.set(style="whitegrid")

# Initialize the matplotlib figure
plt.figure(figsize=(12, 8))

# Create a bar plot
ax = sns.barplot(x='Metric', y='Value', hue='Model', data=df_melted, palette='viridis')

# Label each bar with its value (4 decimals: the two models score very closely).
# Iterating the bar containers labels only real bars; looping over ax.patches
# can also pick up non-bar patches that some seaborn versions add.
for container in ax.containers:
    ax.bar_label(container, fmt='%.4f', padding=3, fontsize=10)

# Final touches
plt.title('Comparison of Deep Learning Models, RNN and LSTM , Across Multiple Metrics', fontsize=16)
plt.xlabel('Metric', fontsize=14)
plt.ylabel('Value', fontsize=14)
plt.legend(title='Model', loc='upper right')

# Adjust layout to prevent cutting off labels
plt.tight_layout()

# Show plot
plt.show()


# from sklearn.metrics import classification_report
# import pandas as pd
# import seaborn as sns
# import matplotlib.pyplot as plt

# # Assuming you've trained the models and made predictions already
# report_logistic = classification_report(y_test, y_pred_logistic, output_dict=True)
# report_dt = classification_report(y_test, y_pred_dt, output_dict=True)
# report_rf = classification_report(y_test, y_pred_rf, output_dict=True)
# report_ann = classification_report(y_test, y_pred, output_dict=True)  # For ANN model
# report_rnn = classification_report(y_test, y_pred_rnn_classes, output_dict=True)
# report_lstm = classification_report(y_test, y_pred_lstm_classes, output_dict=True)

# # Function to extract the average scores for the metrics
# def extract_metrics(report):
#     return {
#         'Accuracy': report['accuracy'],
#         'Precision': report['weighted avg']['precision'],
#         'Recall': report['weighted avg']['recall'],
#         'F1-Score': report['weighted avg']['f1-score']
#     }

# # Get metrics
# metrics_logistic = extract_metrics(report_logistic)
# metrics_dt = extract_metrics(report_dt)
# metrics_rf = extract_metrics(report_rf)
# metrics_ann = extract_metrics(report_ann)
# metrics_rnn = extract_metrics(report_rnn)
# metrics_lstm = extract_metrics(report_lstm)

# # Create a DataFrame
# data = {
#     'Model': ['Logistic Regression', 'Decision Tree', 'Random Forest', 'ANN', 'RNN', 'LSTM'],
#     'Accuracy': [metrics_logistic['Accuracy'], metrics_dt['Accuracy'], metrics_rf['Accuracy'], 
#                  metrics_ann['Accuracy'], metrics_rnn['Accuracy'], metrics_lstm['Accuracy']],
#     'Precision': [metrics_logistic['Precision'], metrics_dt['Precision'], metrics_rf['Precision'], 
#                   metrics_ann['Precision'], metrics_rnn['Precision'], metrics_lstm['Precision']],
#     'Recall': [metrics_logistic['Recall'], metrics_dt['Recall'], metrics_rf['Recall'], 
#                metrics_ann['Recall'], metrics_rnn['Recall'], metrics_lstm['Recall']],
#     'F1-Score': [metrics_logistic['F1-Score'], metrics_dt['F1-Score'], metrics_rf['F1-Score'], 
#                  metrics_ann['F1-Score'], metrics_rnn['F1-Score'], metrics_lstm['F1-Score']]
# }

# df = pd.DataFrame(data)
# df_melted = df.melt(id_vars=["Model"], var_name="Metric", value_name="Value")

# # Set the aesthetic style of the plots
# sns.set(style="whitegrid")

# # Initialize the matplotlib figure
# plt.figure(figsize=(15, 10))

# # Create a bar plot
# ax = sns.barplot(x='Metric', y='Value', hue='Model', data=df_melted, palette='viridis')

# # Add data labels
# for p in ax.patches:
#     ax.annotate(format(p.get_height(), '.2f'), 
#                 (p.get_x() + p.get_width() / 2., p.get_height()), 
#                 ha = 'center', va = 'center', 
#                 xytext = (0, 9), 
#                 textcoords = 'offset points',
#                 fontsize=8)

# # Final touches
# plt.title('Comparison of Machine Learning Models Across Multiple Metrics', fontsize=16)
# plt.xlabel('Metric', fontsize=14)
# plt.ylabel('Value', fontsize=14)
# plt.legend(title='Model', loc='upper right', bbox_to_anchor=(1.25, 1))

# # Adjust layout to prevent cutting off labels
# plt.tight_layout()

# # Show plot
# plt.show()

In [None]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA

# Elbow curve + silhouette analysis to choose the number of K-means clusters.
inertias = []
silhouette_scores = []
K = range(2, 11)  # We'll test from 2 to 10 clusters

# The silhouette coefficient needs pairwise distances between all scored rows, so
# its cost grows quadratically with the row count. Scoring the full training
# matrix once per k is impractical; silhouette_score's documented `sample_size`
# option scores a fixed-seed random subset instead.
SILHOUETTE_SAMPLE_SIZE = min(10_000, X_train.shape[0])

for k in K:
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X_train)
    inertias.append(kmeans.inertia_)

    # Silhouette score (estimated on the subsample)
    labels = kmeans.labels_
    silhouette_scores.append(
        silhouette_score(X_train, labels, sample_size=SILHOUETTE_SAMPLE_SIZE, random_state=42)
    )

# Plot the elbow curve
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
plt.plot(K, inertias, 'bx-')
plt.xlabel('k')
plt.ylabel('Inertia')
plt.title('Elbow Method for Optimal k')

# Plot the silhouette scores
plt.subplot(1, 2, 2)
plt.plot(K, silhouette_scores, 'rx-')
plt.xlabel('k')
plt.ylabel('Silhouette Score')
plt.title('Silhouette Score for Optimal k')

plt.tight_layout()
plt.show()

# Choose the optimal k based on the elbow curve and silhouette scores
optimal_k = 5  # You should adjust this based on the elbow curve and silhouette scores

# Perform K-means clustering with the optimal k
kmeans = KMeans(n_clusters=optimal_k, random_state=42)
kmeans_labels = kmeans.fit_predict(X_train)

# Evaluate the clustering (same subsampled estimate as in the loop above)
silhouette_avg = silhouette_score(
    X_train, kmeans_labels, sample_size=SILHOUETTE_SAMPLE_SIZE, random_state=42
)
print(f"The average silhouette score for K-means clustering with {optimal_k} clusters: {silhouette_avg}")

# Visualize the clustering (using first two principal components for 2D visualization)
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_train)

plt.figure(figsize=(10, 8))
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=kmeans_labels, cmap='viridis')
plt.title(f'K-means Clustering Visualization (k={optimal_k})')
plt.colorbar(scatter)
plt.show()
# from sklearn.cluster import KMeans
# from sklearn.metrics import silhouette_score
# import matplotlib.pyplot as plt

# # Assuming X_train is already scaled. If not, uncomment the following lines:
# # from sklearn.preprocessing import StandardScaler
# # scaler = StandardScaler()
# # X_train_scaled = scaler.fit_transform(X_train)

# # K-means clustering
# n_clusters = 5  # You can adjust this based on your domain knowledge or using methods like elbow curve

# kmeans = KMeans(n_clusters=n_clusters, random_state=42)
# kmeans_labels = kmeans.fit_predict(X_train)

# # Evaluate the clustering
# silhouette_avg = silhouette_score(X_train, kmeans_labels)
# print(f"The average silhouette score for K-means clustering: {silhouette_avg}")

# # Visualize the clustering (using first two principal components for 2D visualization)
# from sklearn.decomposition import PCA

# pca = PCA(n_components=2)
# X_pca = pca.fit_transform(X_train)

# plt.figure(figsize=(10, 8))
# scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=kmeans_labels, cmap='viridis')
# plt.title('K-means Clustering Visualization')
# plt.colorbar(scatter)
# plt.show()

In [None]:
from sklearn.cluster import DBSCAN
from sklearn.neighbors import NearestNeighbors

# k-distance graph used to pick DBSCAN's eps.
# With n_neighbors=2 and the training points queried against themselves, column 0
# of the result is each point's distance to itself and column 1 the distance to
# its nearest other point.
neigh = NearestNeighbors(n_neighbors=2)
nbrs = neigh.fit(X_train)
neighbour_distances, indices = nbrs.kneighbors(X_train)
# Sort every column ascending, then keep the nearest-other-point column.
distances = np.sort(neighbour_distances, axis=0)[:, 1]

fig, ax = plt.subplots(figsize=(10, 8))
ax.plot(distances)
ax.set_title('K-distance Graph')
ax.set_xlabel('Data Points sorted by distance')
ax.set_ylabel('Epsilon')
plt.show()



In [None]:
import numpy as np
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA

# Sample a subset of the data to reduce memory usage.
# The size is capped at the number of available rows (drawing more rows than
# exist without replacement raises ValueError) and the draw uses a seeded
# generator so the clustering is reproducible across runs.
sample_size = min(50000, X_train.shape[0])  # Adjust this based on your available memory
sample_rng = np.random.default_rng(42)
indices = sample_rng.choice(X_train.shape[0], size=sample_size, replace=False)
X_sample = X_train[indices]

# Choose an epsilon value where the k-distance curve starts to elbow
eps = 0.5  # This is an example value, adjust based on your data
min_samples = 5  # Adjust this value based on your data

# Perform DBSCAN clustering
dbscan = DBSCAN(eps=eps, min_samples=min_samples)
dbscan_labels = dbscan.fit_predict(X_sample)

# Use PCA for visualization (reduce to 2 components)
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_sample)

# Visualize the clustering
plt.figure(figsize=(10, 8))
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=dbscan_labels, cmap='viridis')
plt.title('DBSCAN Clustering Visualization (Sample)')
plt.colorbar(scatter)
plt.show()

# Print the number of clusters formed; the label -1 marks noise and is not a cluster
n_clusters = len(set(dbscan_labels)) - (1 if -1 in dbscan_labels else 0)
print(f"Number of clusters formed: {n_clusters}")

# from sklearn.cluster import DBSCAN
# from sklearn.neighbors import NearestNeighbors

# # Choose an epsilon value where the curve starts to elbow
# eps = 0.5  # This is an example value, adjust based on the k-distance graph
# min_samples = 5  # Adjust this value based on your data

# dbscan = DBSCAN(eps=eps, min_samples=min_samples)
# dbscan_labels = dbscan.fit_predict(X_train)

# # Evaluate the clustering
# silhouette_avg = silhouette_score(X_train, dbscan_labels)
# print(f"The average silhouette score for DBSCAN clustering: {silhouette_avg}")

# # Visualize the clustering
# plt.figure(figsize=(10, 8))
# scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=dbscan_labels, cmap='viridis')
# plt.title('DBSCAN Clustering Visualization')
# plt.colorbar(scatter)
# plt.show()

# # Print the number of clusters formed (excluding noise if any)
# n_clusters = len(set(dbscan_labels)) - (1 if -1 in dbscan_labels else 0)
# print(f"Number of clusters formed: {n_clusters}")

In [None]:
import numpy as np
from sklearn.cluster import DBSCAN
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# Sample a subset of the data.
# The size is capped at the number of available rows (drawing more rows than
# exist without replacement raises ValueError) and the draw uses a seeded
# generator so the clustering and the plot are reproducible across runs.
sample_size = min(50000, X_train.shape[0])  # Adjust based on your available memory
sample_rng = np.random.default_rng(42)
indices = sample_rng.choice(X_train.shape[0], size=sample_size, replace=False)
X_sample = X_train[indices]

# Adjust DBSCAN parameters
eps = 2.0  # Increased eps value
min_samples = 10  # Increased min_samples

# Perform DBSCAN clustering
dbscan = DBSCAN(eps=eps, min_samples=min_samples)
dbscan_labels = dbscan.fit_predict(X_sample)

# Use t-SNE for visualization
tsne = TSNE(n_components=2, random_state=42)
X_tsne = tsne.fit_transform(X_sample)

# Visualize the clustering
plt.figure(figsize=(12, 10))
scatter = plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=dbscan_labels, cmap='viridis', alpha=0.7)
plt.colorbar(scatter)

# Highlight noise points (label -1) by over-plotting them in faint grey
noise_mask = dbscan_labels == -1
plt.scatter(X_tsne[noise_mask, 0], X_tsne[noise_mask, 1], c='gray', alpha=0.1, s=10)

plt.title('DBSCAN Clustering Visualization (t-SNE)')
plt.xlabel('t-SNE feature 1')
plt.ylabel('t-SNE feature 2')
plt.show()

# Print the number of clusters formed; the label -1 marks noise and is not a cluster
n_clusters = len(set(dbscan_labels)) - (1 if -1 in dbscan_labels else 0)
print(f"Number of clusters formed: {n_clusters}")