<a href="https://colab.research.google.com/github/yunusemravci/ML-Assignments/blob/master/A4_Tensorflow.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive so the dataset files stored there can be read through the
# Colab runtime's filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix

#### Import Data

In [None]:
# Column labels for the annealing files, in file order. The final entry,
# 'classes', is the target label used later in the notebook.
column_names = [
    'family', 'product-type', 'steel', 'carbon', 'hardness',
    'temper_rolling', 'condition', 'formability', 'strength', 'non-ageing',
    'surface-finish', 'surface-quality', 'enamelability', 'bc', 'bf',
    'bt', 'bw/me', 'bl', 'm', 'chrom',
    'phos', 'cbond', 'marvi', 'exptl', 'ferro',
    'corr', 'blue/bright/varn/clean', 'lustre', 'jurofm', 's',
    'p', 'shape', 'thick', 'width', 'len',
    'oil', 'bore', 'packing', 'classes',
]

TRAIN_CSV = "/content/drive/MyDrive/Colab Notebooks/HW4/annealing/anneal.data"
TEST_CSV = "/content/drive/MyDrive/Colab Notebooks/HW4/annealing/anneal.test"


def _read_headerless_csv(path):
    """Read a comma-separated file without a header row and label its columns."""
    frame = pd.read_csv(path, delimiter=',', header=None)
    frame.columns = column_names
    return frame


data_train = _read_headerless_csv(TRAIN_CSV)
data_test = _read_headerless_csv(TEST_CSV)

#### Preprocessing Step

In [None]:
def preprocess_data(df):
    """Impute missing values and one-hot encode the feature columns of ``df``.

    The 'classes' column is set aside untouched and re-attached at the end.
    Every '?' entry is treated as missing. Numeric columns are filled with
    their mean, text columns with their most frequent value, and text columns
    are then expanded with ``pd.get_dummies(..., drop_first=True)``.

    Columns without a single observed value cannot be imputed. They are kept
    as float NaN columns under their original name, so code that later refers
    to them by name keeps working regardless of how the installed pandas
    infers the dtype of an all-missing column.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing a 'classes' column. The frame is not modified.

    Returns
    -------
    pandas.DataFrame
        Imputed and encoded features, with 'classes' as the last column.

    Raises
    ------
    KeyError
        If ``df`` has no 'classes' column.
    """
    # Keep the target aside so it is neither imputed nor one-hot encoded.
    classes_col = df['classes']

    # '?' is the missing-value marker in the raw data.
    features = df.drop(columns='classes').replace('?', np.nan)

    # An all-missing column has no mean and no mode. Pin it to float NaN so
    # the mode lookup below never sees an empty result (which would raise
    # IndexError) and get_dummies does not silently drop the column.
    for col in features.columns:
        if features[col].isna().all():
            features[col] = np.nan

    # Fill numerical columns with their mean.
    for col in features.select_dtypes(include='number').columns:
        features[col] = features[col].fillna(features[col].mean())

    # Fill text columns (object or pandas string dtype) with the most
    # frequent value. At least one value is observed here, so mode() is
    # never empty.
    for col in features.columns:
        col_dtype = features[col].dtype
        if col_dtype == object or isinstance(col_dtype, pd.StringDtype):
            features[col] = features[col].fillna(features[col].mode().iloc[0])

    # One-hot encode the text columns.
    # NOTE(review): with drop_first=True the dropped baseline category depends
    # on which categories occur in *this* frame, so frames encoded separately
    # may use different baselines - confirm this is acceptable for the caller.
    encoded = pd.get_dummies(features, drop_first=True)

    # Re-attach the untouched target column.
    encoded['classes'] = classes_col

    return encoded


#### Apply the Preprocessing Function to Both Train and Test Data

In [None]:
# Run the same preprocessing independently on the training and test frames.
train_df, test_df = (
    preprocess_data(raw_split) for raw_split in (data_train, data_test)
)

#### Column Matching
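Because the training and test sets are preprocessed (and one-hot encoded) separately, the test set can end up with a different set of columns than the training set. The columns of the test dataset are therefore matched to those of the training dataset.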

In [None]:
# Align the test columns with the training columns in a single step:
#   * columns present only in train_df are added to test_df filled with 0,
#   * columns present only in test_df are discarded,
#   * the resulting column order follows train_df.
# reindex replaces the earlier column-by-column insertion loop, which
# fragments the frame when many columns are missing.
test_df = test_df.reindex(columns=train_df.columns, fill_value=0)


#### Drop NaN columns
All columns listed below contain NaN values, which decreases the model accuracy. Therefore, these columns are removed from both the training and test datasets.

In [None]:
# Columns removed from both splits before scaling.
columns_to_drop = ['m', 'marvi', 'corr', 'jurofm', 's', 'p']

# Rebind instead of dropping in place, and ignore names that are already
# absent, so re-running this cell is a no-op rather than a KeyError.
train_df = train_df.drop(columns=columns_to_drop, errors='ignore')
test_df = test_df.drop(columns=columns_to_drop, errors='ignore')

#### Normalization with Standard Scaler

In [None]:
# Standardise the feature columns. The scaler is fitted on the training split
# only, and the same fitted scaler is then applied to the test split.
train_features = train_df.drop(columns=['classes'])
test_features = test_df.drop(columns=['classes'])

scaler = StandardScaler().fit(train_features)
X_train = scaler.transform(train_features)
X_test = scaler.transform(test_features)

# Targets are taken as-is from the 'classes' column of each split.
y_train = train_df['classes']
y_test = test_df['classes']


#### Mapping for classes

In [None]:
# Give every class label an integer id, numbered in order of first appearance
# in the training labels.
class_mapping = {label: idx for idx, label in enumerate(y_train.unique())}
print(class_mapping)  # To see the created mapping

# Series.map turns any label missing from the mapping into NaN without
# complaint. Fail loudly instead, and do it before rebinding anything, so the
# notebook state is not left half-updated.
unseen_labels = set(y_test.unique()) - set(class_mapping)
if unseen_labels:
    raise ValueError(
        "Test labels not present in the training data: "
        f"{sorted(map(str, unseen_labels))}"
    )

y_train = y_train.map(class_mapping)
y_test = y_test.map(class_mapping)


{'3': 0, 'U': 1, '1': 2, '5': 3, '2': 4}


#### Model Creation

In [None]:
# Seed the Python, NumPy and TensorFlow random generators so weight
# initialisation and batch shuffling are repeatable across runs.
tf.keras.utils.set_random_seed(42)

# Small fully connected classifier: two hidden ReLU layers and a softmax
# output with one unit per class seen in the training labels.
model = tf.keras.models.Sequential([
    # Explicit input layer instead of passing input_shape to the first Dense.
    tf.keras.Input(shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(y_train.nunique(), activation='softmax')
])

# The labels are integer ids, hence the sparse categorical cross-entropy loss.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Stop once the validation loss has not improved for 10 epochs and roll back
# to the weights of the best epoch.
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# 20% of the training data is held out for validation through validation_split.
history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2, callbacks=[early_stop])


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100


#### Print Loss and Accuracy

In [None]:
# Evaluate on the held-out test split. `loss` and `accuracy` were not defined
# by any earlier cell, so printing them failed on a fresh kernel. The model is
# compiled with metrics=['accuracy'], so evaluate returns [loss, accuracy].
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)

print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

Test Loss: 4.3409
Test Accuracy: 0.7800


#### Print Classification Report

In [None]:
# Predicted class probabilities for every test sample.
y_pred = model.predict(X_test)
y_pred_classes = np.argmax(y_pred, axis=1)  # Convert predicted probabilities to class labels

# Some classes are never predicted, which leaves their precision undefined.
# zero_division=0 reports 0.0 for those entries explicitly instead of emitting
# an undefined-metric warning for each of them.
print(classification_report(y_test, y_pred_classes, zero_division=0))


              precision    recall  f1-score   support

           0       0.80      0.99      0.88        76
           1       0.83      0.83      0.83         6
           3       0.00      0.00      0.00         7
           4       0.00      0.00      0.00        11

    accuracy                           0.80       100
   macro avg       0.41      0.46      0.43       100
weighted avg       0.66      0.80      0.72       100



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### Print Confusion Matrix

In [None]:
# Confusion matrix of the test predictions (first argument: true labels,
# second argument: predicted labels).
conf_matrix = confusion_matrix(y_test, y_pred_classes)
print(conf_matrix)

[[75  1  0  0]
 [ 1  5  0  0]
 [ 7  0  0  0]
 [11  0  0  0]]
