In [13]:
# openpyxl is needed for reading Excel files. Uncomment the next line to install it into the kernel's environment:
# %pip install openpyxl

In [14]:
# Run the project's data_cleaning entry point before the rest of the notebook.
# NOTE(review): presumably this produces ./data/MMNames_clean.csv, which is read
# further down — confirm in data_cleaning.py.
import data_cleaning as dc
dc.main()

After the cleaning script has run, the first step in our pipeline loads the cleaned data (`./data/MMNames_clean.csv`) into a pandas DataFrame. The pandas library is imported in the same cell, just before the file is read.

In [15]:
# Step 1: Load the data
import pandas as pd

# Read the cleaned CSV into the working DataFrame `df`. The path is relative,
# so it is resolved against the kernel's current working directory.
df = pd.read_csv('./data/MMNames_clean.csv')

In [4]:
# Step 2: Prepare the data
import data_preprocessing as dp
# NOTE(review): StandardScaler is imported but not used anywhere in this cell.
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# NOTE(review): `df` is rebound to the helper's result, so re-running this cell
# without reloading the CSV passes already-processed data back into
# preprocess_category. Presumably the helper encodes 'SR_Name' as integer class
# ids — confirm in data_preprocessing.py.
df = dp.preprocess_category(df,'SR_Name')
# Alternative preprocessing of the 'name' column, currently disabled:
# df = dp.preprocess_onehot(df,'name')

# Target: the values of the processed 'SR_Name' column.
y = df['SR_Name'].values 
# Features: built from the 'name' column by the project's text-preprocessing helper.
X = dp.preprocess_textinput(df, 'name')

# Hold out 30% of the rows for testing; random_state pins the shuffle so the
# split is repeatable.
# NOTE(review): the split is not stratified — consider stratify=y if the class
# sizes are uneven.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print(X_train.shape, X_test.shape)


(13659, 2252) (5854, 2252)


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/zarninwayoo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/zarninwayoo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/zarninwayoo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
# Step 3: Build a NN model with TensorFlow
import tensorflow as tf

def create_classification_model(input_shape, num_classes, params=None):
    """Build and compile a small fully connected softmax classifier.

    The network is Input -> Dense(32, relu) -> Dense(16, relu) ->
    Dense(num_classes, softmax), compiled with the Adam optimizer,
    sparse categorical cross-entropy loss and an accuracy metric.

    Args:
        input_shape: Shape of a single sample, passed straight to
            ``tf.keras.Input(shape=...)``.
        num_classes: Number of units in the softmax output layer.
        params: Optional dict of hyper-parameters. Currently not read by
            this function; accepted so existing callers keep working.
            Defaults to ``None`` rather than ``{}`` because a mutable
            default is created once and shared between all calls.

    Returns:
        The compiled ``tf.keras.Sequential`` model.
    """
    model = tf.keras.Sequential([
        tf.keras.Input(shape=input_shape),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(16, activation='relu'),
        # One output unit per class; softmax turns them into probabilities.
        tf.keras.layers.Dense(num_classes, activation='softmax')
    ])

    # The "sparse" loss variant takes integer class ids as targets
    # (no one-hot encoding of the labels is needed).
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return model

In [6]:
# Step 4: Create and train the model
import time

# Seed the Python, NumPy and TensorFlow random generators so that weight
# initialisation and other random operations are reproducible after a
# kernel restart.
tf.keras.utils.set_random_seed(42)

# perf_counter() is a monotonic clock meant for measuring elapsed time;
# time.time() can jump if the system clock is adjusted mid-run.
start_time = time.perf_counter()
model = create_classification_model(
    input_shape=[X_train.shape[1]],
    num_classes=len(df['SR_Name'].unique()),
)
# NOTE(review): the held-out test split is also used as validation data here.
# That is harmless while nothing is tuned on it, but a separate validation
# split is needed before adding early stopping or model selection.
history = model.fit(
    X_train,
    y_train,
    epochs=50,
    batch_size=32,
    validation_data=(X_test, y_test),
    verbose=0,
)

# Elapsed seconds for building and fitting the model.
training_time = time.perf_counter() - start_time
print(training_time)

2025-06-14 22:02:31.368473: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


12.5975821018219


In [7]:
# Print the layer-by-layer architecture and parameter counts of the fitted model.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 32)                72096     
                                                                 
 dense_1 (Dense)             (None, 16)                528       
                                                                 
 dense_2 (Dense)             (None, 18)                306       
                                                                 
Total params: 72,930
Trainable params: 72,930
Non-trainable params: 0
_________________________________________________________________


In [8]:
# Count how many training samples carry each class label, ordered by label,
# to see how evenly the classes are represented in the training split.
print("Class distribution:")
print(pd.Series(y_train).value_counts().sort_index())

Class distribution:
0     1561
1      648
2      571
3      434
4      620
5      174
6      381
7     1580
8     1334
9      401
10     149
11     932
12    2310
13     175
14    1004
15     428
16     372
17     585
Name: count, dtype: int64


In [9]:
# Step 5: Evaluate the model

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

# Predict per-class probabilities for the test split, then keep the index of
# the most probable class for each row.
y_pred = model.predict(X_test, batch_size=32, verbose=0)
y_pred = y_pred.argmax(axis=1)
# output_dict=True returns the report as a nested dict keyed by label;
# transposing gives one row per label with the metrics as columns.
report = classification_report(y_test, y_pred, output_dict=True)
report_df = pd.DataFrame(report).round(2).transpose()
# The row index holds the class labels and the accuracy / average row names,
# so it has to be written out; with index=False the CSV rows were unidentifiable.
report_df.to_csv('./data/cls_report_test.csv', index_label='label')




In [10]:
# Overall accuracy on the test split, followed by the per-class
# precision / recall / F1 table.
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.3462589682268534
              precision    recall  f1-score   support

           0       0.36      0.39      0.38       617
           1       0.15      0.15      0.15       273
           2       0.25      0.16      0.19       243
           3       0.62      0.22      0.33       214
           4       0.47      0.43      0.45       258
           5       0.50      0.43      0.46        54
           6       0.30      0.20      0.24       171
           7       0.29      0.37      0.32       664
           8       0.26      0.24      0.25       569
           9       0.30      0.27      0.28       199
          10       0.24      0.06      0.09        85
          11       0.39      0.36      0.37       399
          12       0.33      0.48      0.39       968
          13       0.38      0.26      0.30        82
          14       0.63      0.62      0.62       472
          15       0.35      0.24      0.28       188
          16       0.34      0.25      0.29       152
        

In [11]:
# Repeat the evaluation on the training split to compare against the test scores.
# NOTE: `y_pred` is rebound here and now holds training-set predictions; cells
# that pair `y_pred` with `y_test` must not be re-run after this one.
y_pred = model.predict(X_train, batch_size=32, verbose=0)
y_pred = y_pred.argmax(axis=1)
# output_dict=True returns the report as a nested dict keyed by label;
# transposing gives one row per label with the metrics as columns.
report = classification_report(y_train, y_pred, output_dict=True)
report_df = pd.DataFrame(report).round(2).transpose()
# The row index holds the class labels and the accuracy / average row names,
# so it has to be written out; with index=False the CSV rows were unidentifiable.
report_df.to_csv('./data/cls_report_train.csv', index_label='label')

In [12]:
# Accuracy and per-class report for the training split. `y_pred` must hold the
# training-set predictions computed in the previous cell (it no longer holds
# the test-set predictions at this point).
print(accuracy_score(y_train, y_pred))
print(classification_report(y_train, y_pred))

0.6751592356687898
              precision    recall  f1-score   support

           0       0.68      0.72      0.70      1561
           1       0.59      0.54      0.56       648
           2       0.61      0.37      0.46       571
           3       0.96      0.91      0.93       434
           4       0.84      0.80      0.82       620
           5       0.92      0.83      0.88       174
           6       0.79      0.71      0.75       381
           7       0.52      0.64      0.57      1580
           8       0.60      0.55      0.57      1334
           9       0.67      0.66      0.66       401
          10       0.88      0.20      0.33       149
          11       0.73      0.70      0.71       932
          12       0.58      0.73      0.65      2310
          13       0.93      0.80      0.86       175
          14       0.87      0.90      0.88      1004
          15       0.85      0.68      0.76       428
          16       0.83      0.56      0.67       372
        