In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve, classification_report
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data
data = pd.read_csv('Zone.csv')

# Print the first few rows of the data
print("Initial Data:")
print(data.head())

# Check for missing values
print("Missing values in each column:")
print(data.isnull().sum())

# Drop rows with missing values (if any)
data.dropna(inplace=True)

# Shuffle the data
data = data.sample(frac=1).reset_index(drop=True)

data

Initial Data:
   Location   Latitude  Longitude       SO2       NO2  Temperature   Humidity  \
0  Chakulia  22.336599    86.4673  0.000213  0.000048   287.770193  49.600002   
1  Chakulia  22.336599    86.4673  0.000268  0.000058   288.615841  48.861889   
2  Chakulia  22.336599    86.4673  0.000173  0.000050   289.148186  52.008862   
3  Chakulia  22.336599    86.4673  0.000605  0.000048   289.769308  43.066109   
4  Chakulia  22.336599    86.4673       NaN  0.000061   289.768827  47.792675   

        Class  
0  Industrial  
1  Industrial  
2  Industrial  
3  Industrial  
4  Industrial  
Missing values in each column:
Location          0
Latitude          0
Longitude         0
SO2            2227
NO2            2093
Temperature       0
Humidity          3
Class             0
dtype: int64


Unnamed: 0,Location,Latitude,Longitude,SO2,NO2,Temperature,Humidity,Class
0,Dumaria,22.246800,86.772797,-0.000205,0.000017,299.094672,71.400002,NonIndustrial
1,Dumaria,22.246800,86.772797,0.000684,0.000049,293.922546,52.600002,NonIndustrial
2,Ghatshila,22.399500,86.458397,0.000144,0.000026,302.573057,90.700005,NonIndustrial
3,Ghatshila,22.399500,86.458397,-0.000084,0.000023,292.684265,31.830709,NonIndustrial
4,Chakulia,22.336599,86.467300,-0.000525,0.000026,295.308222,83.900002,Industrial
...,...,...,...,...,...,...,...,...
2537,Chakulia,22.336599,86.467300,-0.000397,0.000039,293.766265,51.299999,Industrial
2538,Ghatshila,22.399500,86.458397,0.000354,0.000031,300.560657,79.090767,NonIndustrial
2539,Dumaria,22.246800,86.772797,0.000972,0.000044,300.349666,96.599998,NonIndustrial
2540,Dumaria,22.246800,86.772797,-0.000189,0.000037,293.260916,47.200001,NonIndustrial


In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve, classification_report
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
import seaborn as sns

# Load the data
file_path = 'Zone.csv'  # Update this path to match your local file path
data = pd.read_csv(file_path)

# Inspect the first few rows of the data
print("Initial Data:")
print(data.head())

# Print the column names
print("Column names:")
print(data.columns)

# Check for missing values
print("Missing values in each column:")
print(data.isnull().sum())

# Drop rows with missing values (if any)
data.dropna(inplace=True)

# Shuffle the data
data = data.sample(frac=1).reset_index(drop=True)

# Separate the features and target
X = data.drop('Location', axis=1)
y = data['Location']

# Apply one-hot encoding to the 'Location' column
encoder = OneHotEncoder(sparse=False)
y_encoded = encoder.fit_transform(y.values.reshape(-1, 1))

# Split the data into training and testing sets, stratifying by the target variable
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42, stratify=y)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Train a RandomForest classifier
classifier = RandomForestClassifier(random_state=42)
classifier.fit(X_train, y_train)

# Make predictions
y_pred = classifier.predict(X_test)
y_pred_proba = classifier.predict_proba(X_test)

# Convert one-hot encoded y_test back to original labels for comparison
y_test_labels = encoder.inverse_transform(y_test)
y_pred_labels = encoder.inverse_transform(y_pred)

# Calculate accuracy
accuracy = accuracy_score(y_test_labels, y_pred_labels)
print(f"Accuracy: {accuracy}")

# Calculate ROC AUC
roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr')
print(f"ROC AUC: {roc_auc}")

# Confusion matrix
conf_matrix = confusion_matrix(y_test_labels, y_pred_labels)
print("Confusion Matrix:")
print(conf_matrix)

# Classification report
class_report = classification_report(y_test_labels, y_pred_labels)
print("Classification Report:")
print(class_report)

# Plot confusion matrix
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=encoder.categories_[0], yticklabels=encoder.categories_[0])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# Plot ROC Curve for each class
fpr = {}
tpr = {}
thresh ={}

for i in range(len(encoder.categories_[0])):
    fpr[i], tpr[i], thresh[i] = roc_curve(y_test[:, i], y_pred_proba[:, i])

plt.figure(figsize=(10, 7))
for i in range(len(encoder.categories_[0])):
    plt.plot(fpr[i], tpr[i], linestyle='--', label=f'Class {encoder.categories_[0][i]} vs Rest')

plt.plot([0, 1], [0, 1], color='black')
plt.title('Multiclass ROC curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive rate')
plt.legend(loc='best')
plt.show()


Initial Data:
   Location   Latitude  Longitude       SO2       NO2  Temperature   Humidity  \
0  Chakulia  22.336599    86.4673  0.000213  0.000048   287.770193  49.600002   
1  Chakulia  22.336599    86.4673  0.000268  0.000058   288.615841  48.861889   
2  Chakulia  22.336599    86.4673  0.000173  0.000050   289.148186  52.008862   
3  Chakulia  22.336599    86.4673  0.000605  0.000048   289.769308  43.066109   
4  Chakulia  22.336599    86.4673       NaN  0.000061   289.768827  47.792675   

        Class  
0  Industrial  
1  Industrial  
2  Industrial  
3  Industrial  
4  Industrial  
Column names:
Index(['Location', 'Latitude', 'Longitude', 'SO2', 'NO2', 'Temperature',
       'Humidity', 'Class'],
      dtype='object')
Missing values in each column:
Location          0
Latitude          0
Longitude         0
SO2            2227
NO2            2093
Temperature       0
Humidity          3
Class             0
dtype: int64




ValueError: could not convert string to float: 'NonIndustrial'

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns


# Train a RandomForest classifier
rf_classifier = RandomForestClassifier(random_state=42)
rf_classifier.fit(X_train, y_train)

# Train a Decision Tree classifier
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)



# Function to evaluate model
def evaluate_model(classifier, X_test, y_test, model_name):
    y_pred = classifier.predict(X_test)
    y_pred_proba = classifier.predict_proba(X_test)

    # Calculate accuracy
    accuracy = accuracy_score(y_test, y_pred)
    print(f'{model_name} Accuracy: {accuracy}')

    # Generate confusion matrix
    conf_matrix = confusion_matrix(y_test, y_pred)
    print(f"{model_name} Confusion Matrix:")
    print(conf_matrix)

    # Calculate ROC AUC
    roc_auc = roc_auc_score(y_test, y_pred_proba, multi_class='ovr')
    print(f'{model_name} ROC AUC: {roc_auc}')

    # Plot confusion matrix
    plt.figure(figsize=(10, 7))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
    plt.title(f'{model_name} Confusion Matrix')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

    # Plot ROC curve
    fpr = {}
    tpr = {}
    n_class = y.nunique()

    for i in range(n_class):    
        fpr[i], tpr[i], _ = roc_curve(y_test, y_pred_proba[:,i], pos_label=i)
    
    plt.figure(figsize=(10, 7))
    for i in range(n_class):
        plt.plot(fpr[i], tpr[i], linestyle='--', label=f'Class {i} vs Rest')
    plt.plot([0, 1], [0, 1], 'b--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title(f'{model_name} Multiclass ROC Curve')
    plt.legend(loc='best')
    plt.show()

    # Print classification report
    print(f"{model_name} Classification Report:")
    print(classification_report(y_test, y_pred))

# Evaluate RandomForest model
evaluate_model(rf_classifier, X_test, y_test, "RandomForest")

# Evaluate DecisionTree model
evaluate_model(dt_classifier, X_test, y_test, "DecisionTree")


RandomForest Accuracy: 1.0
RandomForest Confusion Matrix:
[[246   0]
 [  0 517]]


ValueError: y should be a 1d array, got an array of shape (763, 2) instead.

In [13]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, roc_auc_score, roc_curve, classification_report
from sklearn.neural_network import MLPClassifier
import matplotlib.pyplot as plt
import seaborn as sns


# Split the data into training and testing sets, stratifying by the target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=y)


# Train an MLP classifier
mlp_classifier = MLPClassifier(random_state=42, max_iter=100)
mlp_classifier.fit(X_train, y_train)

# Make predictions
y_pred = mlp_classifier.predict(X_test)
y_pred_proba = mlp_classifier.predict_proba(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

# Generate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Calculate ROC AUC
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f'ROC AUC: {roc_auc}')

# Plot confusion matrix
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Plot ROC curve
fpr = {}
tpr = {}
n_class = y

for i in range(n_class):    
    fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_pred_proba[:, i])
    
plt.figure(figsize=(10, 7))
for i in range(n_class):
    plt.plot(fpr[i], tpr[i], linestyle='--', label=f'Class {i} vs Rest')
plt.plot([0, 1], [0, 1], 'b--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Multiclass ROC Curve')
plt.legend(loc='best')
plt.show()

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))


Accuracy: 0.6290956749672346
Confusion Matrix:
[[ 16 230]
 [ 53 464]]


ValueError: y should be a 1d array, got an array of shape (763, 2) instead.

In [51]:
from sklearn.model_selection import GridSearchCV

# Scale the data
scaler = StandardScaler()
X_train= scaler.fit_transform(X_train)
X_test= scaler.transform(X_test)


param_grid = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50)],
    'alpha': [0.0001, 0.001, 0.01],
    'learning_rate_init': [0.001, 0.01, 0.1]
}

grid_search = GridSearchCV(estimator=MLPClassifier(random_state=42, max_iter=100),
                           param_grid=param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train, y_train)
best_params = grid_search.best_params_
print("Best parameters:", best_params)

# Use the best parameters to train the model
mlp_classifier = grid_search.best_estimator_
mlp_classifier.fit(X_train, y_train)




Best parameters: {'alpha': 0.0001, 'hidden_layer_sizes': (50,), 'learning_rate_init': 0.001}




In [14]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')



Accuracy: 0.6290956749672346


In [15]:
# Generate confusion matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Calculate ROC AUC
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f'ROC AUC: {roc_auc}')

# Plot confusion matrix
plt.figure(figsize=(10, 7))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Plot ROC curve
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
plt.figure(figsize=(10, 7))
plt.plot(fpr, tpr, linestyle='--', label='ROC Curve')
plt.plot([0, 1], [0, 1], 'b--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend(loc='best')
plt.show()

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))

# Plot learning curve
train_sizes, train_scores, test_scores = learning_curve(mlp_classifier, X_train, y_train, cv=3, scoring='accuracy', n_jobs=-1, train_sizes=np.linspace(0.1, 1.0, 10))

train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)

plt.figure(figsize=(10, 7))
plt.fill_between(train_sizes, train_scores_mean - train_scores_std, train_scores_mean + train_scores_std, alpha=0.1, color="r")
plt.fill_between(train_sizes, test_scores_mean - test_scores_std, test_scores_mean + test_scores_std, alpha=0.1, color="g")
plt.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
plt.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")

plt.title('Learning Curve')
plt.xlabel('Training examples')
plt.ylabel('Score')
plt.legend(loc='best')
plt.show()

Confusion Matrix:
[[ 16 230]
 [ 53 464]]


ValueError: y should be a 1d array, got an array of shape (763, 2) instead.