<a href="https://colab.research.google.com/github/vsa1920/ML_prelim/blob/main/Task.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Run before executing the code
#!pip install spacy
!python -m spacy download en_core_web_lg
!pip install spektral

In [1]:
# Spacy for generating word embeddings for the node features of GNN
import spacy
nlp = spacy.load('en_core_web_lg')

In [14]:
# Hyperparameters
P = 4 # Window Size for Words while generating graphs
batch_size = 128
learning_rate = 0.01
epochs = 40

In [4]:
import numpy as np
import scipy.sparse as sp
from spektral.data import Graph, Dataset, DisjointLoader
from spektral.models import GeneralGNN

In [5]:
import tensorflow as tf
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.metrics import categorical_accuracy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.backend import argmax

In [6]:
from sklearn.metrics import f1_score, classification_report

In [7]:
import pandas as pd
data = pd.read_csv("https://raw.githubusercontent.com/vsa1920/ML_prelim/main/train.csv")
data.dropna(axis=0, inplace=True)
num_tags = len(data["Industry Classification Tag"].unique())
# Look at all the unique tags in the dataset - These will be our targets
labels = {data["Industry Classification Tag"].unique()[i]:i for i in range(num_tags)}

In [8]:
def text_to_graph(text, label):
  """
  Generates a graph from text (Business Description)
  The node features are taken to be the word vector embeddings
  """
  line_embedding = nlp(text.lower())
  #token_size = len(line_embedding[0].vector)
  n = len(line_embedding)
  x = np.array([token.vector for token in line_embedding])
  a = np.zeros((n, n))
  for i in range(n):
    for j in range(max(0, i-P), min(n, i+P)):
      a[i][j] = 1
    a[i][i] = 0
  y = np.zeros(len(labels))
  y[labels[label]] = 1
  return Graph(x=x, a=sp.csr_matrix(a), y=y)

def sample_to_graph(text):
  """
  Generates a graph from text (Business Description)
  The node features are taken to be the word vector embeddings
  This function is to generate a sample of graphs for prediction with no targets
  """
  line_embedding = nlp(text.lower())
  #token_size = len(line_embedding[0].vector)
  n = len(line_embedding)
  x = np.array([token.vector for token in line_embedding])
  a = np.zeros((n, n))
  for i in range(n):
    for j in range(max(0, i-P), min(n, i+P)):
      a[i][j] = 1
    a[i][i] = 0
  y = np.zeros(62)
  return Graph(x=x, a=sp.csr_matrix(a), y=y)   

In [19]:
class MyDataset(Dataset):
  """
  A dataset of graphs created from the Business Description using text_to_graph method.
  """
  def __init__(self, pd_dataframe, **kwargs):
    self.dataframe = pd_dataframe
    super().__init__(**kwargs)
    
  def read(self):
    graph_list = []
    for row_idx in range(len(self.dataframe)):
      graph_list.append(text_to_graph(self.dataframe.iloc[row_idx]["Business Description"], self.dataframe.iloc[row_idx]["Industry Classification Tag"]))
    return graph_list

# The following class is to make a dataset where the targets are not known
class SampleDataset(Dataset):
  """
  A dataset of graphs created from the Business Description using text_to_graph method.
  """
  def __init__(self, pd_dataframe, **kwargs):
    self.dataframe = pd_dataframe
    super().__init__(**kwargs)
    
  def read(self):
    graph_list = []
    for row_idx in range(len(self.dataframe)):
      graph_list.append(sample_to_graph(self.dataframe.iloc[row_idx]["Business Description"]))
    return graph_list

In [10]:
# This creates a dataset of graphs with node features as word vector embeddings
dataset = MyDataset(data)

In [11]:
# Fitting a General GNN Model - Better architecture can be used
# Dropout was tuned with validation set to avoid overfitting
model = GeneralGNN(len(labels), activation='softmax', dropout=0.3)
optimizer = Adam(learning_rate)
loss_fn = CategoricalCrossentropy()

In [12]:
# Code used from Spektral examples to fit a general GNN Model
# Train/validation split
np.random.shuffle(dataset)
split = int(0.8 * len(dataset))
data_tr, data_te = dataset[:split], dataset[split:]

In [91]:
# Data loader
loader_tr = DisjointLoader(data_tr, batch_size=batch_size, epochs=epochs)
loader_te = DisjointLoader(data_te, batch_size=len(data_te))

In [88]:
# Training function
@tf.function(input_signature=loader_tr.tf_signature(), experimental_relax_shapes=True)
def train_on_batch(inputs, target):
    """
    A method to use loaders to train the GNN on -
    Loaders are responsible for handling data in batches and monitoring their performance
    """
    with tf.GradientTape() as tape:
        predictions = model(inputs, training=True)
        loss = loss_fn(target, predictions) + sum(model.losses)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    acc = tf.reduce_mean(categorical_accuracy(target, predictions))
    return loss, acc

In [89]:
# Evaluation function
def evaluate(loader):
    """
    Method to evaluate performance from the validation set generated from the samples
    """
    step = 0
    results = []
    for batch in loader:
        step += 1
        inputs, target = batch
        predictions = model(inputs, training=False)
        loss = loss_fn(target, predictions)
        acc = tf.reduce_mean(categorical_accuracy(target, predictions))
        # Convert the one-hot results into 
        truth, preds = argmax(target, axis=-1).numpy(), argmax(predictions, axis=-1).numpy()
        results.append((loss, acc, len(target)))  # Keep track of batch size
        if step == loader.steps_per_epoch:
            results = np.array(results)
            print (classification_report(truth, preds, zero_division = 0))
            return np.average(results[:, :-1], 0, weights=results[:, -1])

In [90]:
# Training loop
epoch = step = 0
results = []
for batch in loader_tr:
    step += 1
    loss, acc = train_on_batch(*batch)
    results.append((loss, acc))
    if step == loader_tr.steps_per_epoch:
        step = 0
        epoch += 1
        results_te = evaluate(loader_te)
        print(
            "Epoch {} - Train loss: {:.3f} - Train acc: {:.3f} - "
            "Test loss: {:.3f} - Test acc: {:.3f}".format(
                epoch, *np.mean(results, 0), *results_te
            )
        )

              precision    recall  f1-score   support

           0       0.79      0.62      0.70        24
           1       0.80      0.31      0.44        13
           2       1.00      0.50      0.67         8
           3       0.64      0.84      0.73        19
           4       0.62      0.43      0.51        35
           5       0.59      0.67      0.63        24
           6       0.80      0.67      0.73        12
           7       0.64      0.78      0.71        86
           8       0.64      0.67      0.65        24
           9       0.88      1.00      0.93        21
          10       0.17      0.60      0.27        15
          11       1.00      0.22      0.36         9
          12       1.00      0.46      0.63        13
          13       0.48      0.67      0.56        15
          14       0.43      0.64      0.51        14
          15       1.00      0.12      0.22        16
          16       0.17      0.56      0.26         9
          17       1.00    

In [26]:
# Read the test files
sample_set = pd.read_csv("https://raw.githubusercontent.com/vsa1920/ML_prelim/main/test.csv")
sample_set["Industry Classification Tags"] = np.nan
sample_set.head()

Unnamed: 0,Company,Business Description,Industry Classification Tags
0,3rd Rock Multimedia Ltd,3rd Rock Multimedia Limited is an India-based ...,
1,Andhra Petrochemicals Ltd,The Andhra Petrochemicals Limited is an India-...,
2,Force Motors Ltd,Force Motors Limited is a holding company. The...,
3,Diamines And Chemicals Ltd,Diamines and Chemicals Limited is a holding co...,
4,Insilco Ltd,Insilco Limited is engaged in manufacturing an...,


In [22]:
# Create a Dataset of Graphs to predict labels
sample_data = SampleDataset(sample_set)

<function dict.keys>

In [42]:
index_to_label = list(labels.keys())

772

In [84]:
# Batch loader to run on the sample dataset to generate 5 most appropriate labellings
loader_s = DisjointLoader(sample_data, batch_size=1, epochs=1)
idx = 0
for batch in loader_s:
  inputs, target = batch
  predictions = model(inputs, training=False).numpy()[0]
  max_5 = []
  for _ in range(5):
    index = int(np.argmax(predictions, axis=-1))
    max_5.append(index_to_label[index])
    predictions[index] = -np.Inf
  sample_set["Industry Classification Tags"][idx] = str(max_5).strip("[]")
  idx += 1
  


In [85]:
sample_set.head()

Unnamed: 0,Company,Business Description,Industry Classification Tags
0,3rd Rock Multimedia Ltd,3rd Rock Multimedia Limited is an India-based ...,"'Application Software', 'Systems Software', 'R..."
1,Andhra Petrochemicals Ltd,The Andhra Petrochemicals Limited is an India-...,"'Building Products', 'Diversified Support Serv..."
2,Force Motors Ltd,Force Motors Limited is a holding company. The...,"'Integrated Telecommunication Services', 'Comm..."
3,Diamines And Chemicals Ltd,Diamines and Chemicals Limited is a holding co...,"'Commodity Chemicals', 'Specialty Chemicals', ..."
4,Insilco Ltd,Insilco Limited is engaged in manufacturing an...,"'Research & Consulting Services', 'Systems Sof..."


In [86]:
# Download the processed test file
sample_set.to_csv("Processed_file.csv")
from google.colab import files
files.download("Processed_file.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>