In [None]:
#!pip install openpyxl for reading excel

In [1]:
import data_cleaning as dc
dc.main()

The first step in our pipeline involves loading the data into a pandas DataFrame. This is accomplished using the pandas library, which is imported at the beginning of the script.

In [2]:
# Step 1: Load the data
import pandas as pd

df = pd.read_csv('./data/MMNames_clean.csv')

In [3]:
# Step 2: Prepare the data
import data_preprocessing as dp
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

df = dp.preprocess_category(df,'SR_Name')
df = dp.preprocess_onehot(df,'name')

y = df['SR_Name'].values 
X = df.drop(columns=['SR_Name']).values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print(X_train.shape, X_test.shape)


[nltk_data] Downloading package punkt to /Users/zno/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/zno/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/zno/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


(13659, 13003) (5854, 13003)


In [23]:
# Step 3: Build a NN model with TensorFlow
import tensorflow as tf

def create_classification_model(input_shape, num_classes, params={}):
    model = tf.keras.Sequential([
        tf.keras.Input(shape=input_shape),
        tf.keras.layers.Dense(128, activation='relu'),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(32, activation='relu'),
        tf.keras.layers.Dense(16, activation='relu'),
        tf.keras.layers.Dense(num_classes, activation='softmax')
    ])

    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    
    return model

In [24]:
# Step 4: Create and train the model
import time

start_time = time.time()
model = create_classification_model(input_shape=[X_train.shape[1]],num_classes=len(df['SR_Name'].unique()), )
history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test), verbose=0)

training_time = time.time() - start_time
print(training_time)

93.62240314483643


In [25]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_7 (Dense)             (None, 128)               1664512   
                                                                 
 dense_8 (Dense)             (None, 64)                8256      
                                                                 
 dense_9 (Dense)             (None, 32)                2080      
                                                                 
 dense_10 (Dense)            (None, 16)                528       
                                                                 
 dense_11 (Dense)            (None, 18)                306       
                                                                 
Total params: 1,675,682
Trainable params: 1,675,682
Non-trainable params: 0
_________________________________________________________________


In [26]:
# Step 5: Evaluate the model

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

y_pred = model.predict(X_test, batch_size=32, verbose=0)
y_pred = y_pred.argmax(axis=1)
report = classification_report(y_test, y_pred, output_dict=True)
report_df = pd.DataFrame(report).round(2).transpose()
report_df.to_csv('./data/cls_report_test.csv', index=False)




In [27]:
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.2801503245644004
              precision    recall  f1-score   support

           0       0.13      0.78      0.22       617
           1       0.34      0.13      0.19       273
           2       0.35      0.21      0.26       243
           3       0.95      0.18      0.30       214
           4       0.61      0.17      0.27       258
           5       0.62      0.15      0.24        54
           6       0.68      0.16      0.26       171
           7       0.54      0.23      0.32       664
           8       0.39      0.22      0.29       569
           9       0.54      0.25      0.34       199
          10       0.47      0.08      0.14        85
          11       0.64      0.22      0.33       399
          12       0.53      0.30      0.39       968
          13       0.43      0.11      0.17        82
          14       0.79      0.22      0.35       472
          15       0.52      0.18      0.26       188
          16       0.61      0.30      0.41       152
        

In [28]:
y_pred = model.predict(X_train, batch_size=32, verbose=0)
y_pred = y_pred.argmax(axis=1)
report = classification_report(y_train, y_pred, output_dict=True)
report_df = pd.DataFrame(report).round(2).transpose()
report_df.to_csv('./data/cls_report_train.csv', index=False)

In [29]:
print(accuracy_score(y_train, y_pred))
print(classification_report(y_train, y_pred))

0.86382604875906
              precision    recall  f1-score   support

           0       0.87      0.84      0.85      1561
           1       0.82      0.78      0.80       648
           2       0.75      0.82      0.78       571
           3       0.99      0.98      0.99       434
           4       0.94      0.91      0.93       620
           5       0.93      0.95      0.94       174
           6       0.96      0.88      0.92       381
           7       0.88      0.76      0.82      1580
           8       0.75      0.87      0.81      1334
           9       0.86      0.87      0.86       401
          10       0.92      0.63      0.75       149
          11       0.88      0.92      0.90       932
          12       0.82      0.88      0.85      2310
          13       0.98      0.94      0.96       175
          14       0.97      0.94      0.95      1004
          15       0.89      0.89      0.89       428
          16       0.88      0.94      0.91       372
          