<a href="https://colab.research.google.com/github/themukil/SummerInternProject/blob/main/finalhope.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.utils import shuffle
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

To import necessary modules and libraries for the TensorFlow and scikit-learn code

In [None]:
# Run Colab's interactive user-authentication flow for this session.
from google.colab import auth
auth.authenticate_user()

Authenticating the user using the `google.colab` library in Python

In [None]:
# Mount Google Drive at /content/drive; the dataset is read from that mount.
from google.colab import drive
drive.mount('/content/drive')

Mounting user's Google Drive using the `google.colab` library in Python

In [None]:
# Load the dataset from the mounted Drive and shuffle its rows with a fixed
# seed so that later positional splits are reproducible.
data = shuffle(
  pd.read_csv('/content/drive/MyDrive/bq-results-20230716-111421-1689506072370/hope.csv'),
  random_state=22,
)

# Preview the first rows.
data.head()

To read a CSV file from the Google Drive, shuffle the data, and display the first few rows of the dataset

In [None]:
# Copy the 'label' column into a plain Python list and echo it.
labels = list(data['label'].values)
print(labels, '\n')

To extract and print the 'label' values from a DataFrame

In [None]:
# Translate numeric labels to names (1 -> 'non-hope', 0 -> 'hope').
# Values that are neither 0 nor 1 produce no entry.
text_labels = [
  'non-hope' if value == 1 else 'hope'
  for value in labels
  if value == 1 or value == 0
]
print(text_labels)

# Wrap every name in its own one-element list.
txt_labels = [[name] for name in text_labels]
print(txt_labels)

To convert the numerical labels into text labels (0 → `hope`, 1 → `non-hope`) and then create a list of lists containing the text labels

In [None]:
# The first 80% of the (already shuffled) rows form the training split;
# the remainder is the test split.
train_size = int(0.8 * len(data))
print("Train size: {}".format(train_size))
print("Test size: {}".format(len(data) - train_size))

To calculate and print the train and test sizes based on a given dataset. 80% of the data from the dataset is used to train the model and the remaining 20% is held out for testing


In [None]:
# Encode the text labels as indicator vectors before splitting. Without this
# step `labels_encoded` (and `num_labels` / `label_encoder`, which later cells
# read) would be undefined on a fresh run.
label_encoder = MultiLabelBinarizer()
labels_encoded = label_encoder.fit_transform(txt_labels)
num_labels = len(labels_encoded[0])

# Positional split: rows before `train_size` train the model, the rest test it.
train_labels = labels_encoded[:train_size]
test_labels = labels_encoded[train_size:]

To split encoded labels into train and test sets

In [None]:
%%writefile preprocess.py

from tensorflow.keras.preprocessing import text

class TextPreprocessor(object):
  """Fits a Keras text tokenizer and uses it to encode texts as a matrix."""

  def __init__(self, vocab_size):
    """Store the vocabulary cap; the tokenizer stays unset until fitted."""
    self._vocab_size = vocab_size
    self._tokenizer = None

  def create_tokenizer(self, text_list):
    """Fit a fresh tokenizer (capped at `_vocab_size` words) on `text_list`."""
    fitted = text.Tokenizer(num_words=self._vocab_size)
    fitted.fit_on_texts(text_list)
    self._tokenizer = fitted

  def transform_text(self, text_list):
    """Return the stored tokenizer's `texts_to_matrix` output for `text_list`.

    `create_tokenizer` must have been called first; until then the stored
    tokenizer is None.
    """
    return self._tokenizer.texts_to_matrix(text_list)

To write the given code snippet into a file named `preprocess.py`

In [None]:
from preprocess import TextPreprocessor

VOCAB_SIZE = 400  # This is a hyperparameter, try out different values for your dataset

# Split the raw text at the same position as the labels.
train_qs = data['text'].values[:train_size]
test_qs = data['text'].values[train_size:]

# TextPreprocessor requires the vocabulary size at construction time; pass the
# hyperparameter directly so the tokenizer always agrees with VOCAB_SIZE.
processor = TextPreprocessor(VOCAB_SIZE)
# Fit the tokenizer on the training text only, then encode both splits with it.
processor.create_tokenizer(train_qs)

body_train = processor.transform_text(train_qs)
body_test = processor.transform_text(test_qs)

To use the `TextPreprocessor` class from the `preprocess.py` file and preprocess text data using it (VOCAB_SIZE is set to 400)

In [None]:
# Inspect the first encoded training example: its width, then its values.
first_row = body_train[0]
print(len(first_row))
print(first_row)

To print the length and the content of the processed text data in `body_train`

In [None]:
import pickle

# Serialize the fitted preprocessor so it can be reloaded next to the model.
with open('./processor_state.pkl', 'wb') as state_file:
  pickle.dump(processor, state_file)

To save the `processor` object using the `pickle` module

In [None]:
def create_model(vocab_size, num_tags):
  """Build and compile a three-layer dense classifier.

  Args:
    vocab_size: Length of each input vector (features per example).
    num_tags: Number of output units, one per label.

  Returns:
    A compiled `tf.keras` Sequential model (binary cross-entropy loss,
    Adam optimizer, accuracy metric).
  """
  model = tf.keras.models.Sequential()
  # Size the layers from the arguments, not from notebook-level globals, so
  # the function builds what the caller asked for.
  model.add(tf.keras.layers.Dense(50, input_shape=(vocab_size,), activation='relu'))
  model.add(tf.keras.layers.Dense(25, activation='relu'))
  model.add(tf.keras.layers.Dense(num_tags, activation='sigmoid'))

  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  return model

To create a model using the `tf.keras` library with specified layers and activation functions


In [None]:
# Build the network and show its layer summary.
model = create_model(VOCAB_SIZE, num_labels)
model.summary()

# Fit on the training split, then score the held-out test split.
model.fit(body_train, train_labels, epochs=10, batch_size=128, validation_split=0.1)
eval_metrics = model.evaluate(body_test, test_labels, batch_size=128)
print('Eval loss/accuracy:{}'.format(eval_metrics))

# Persist the trained model to disk.
model.save('keras_saved_model.h5')

To create, train, evaluate, and save a model as 'keras_saved_model.h5' using the `tf.keras` library (epochs = 10, batch_size = 128 and validation_split = 0.1)

In [None]:
# Re-run evaluation on the test split; as the last expression in the cell its
# return value is displayed by the notebook.
model.evaluate(body_test, test_labels, batch_size=128)

To evaluate the model on the test data and print the evaluation metrics

In [None]:
%%writefile model_prediction.py
import pickle
import os
import numpy as np

class CustomModelPrediction(object):
  """Pairs a trained model with its text preprocessor for raw-text inference."""

  def __init__(self, model, processor):
    """Keep references to the model and the preprocessor."""
    self._model = model
    self._processor = processor

  def predict(self, instances, **kwargs):
    """Encode `instances` with the preprocessor and return model outputs.

    Extra keyword arguments are accepted and ignored. The result is the
    model's prediction converted with `.tolist()`.
    """
    features = self._processor.transform_text(instances)
    return self._model.predict(features).tolist()

  @classmethod
  def from_path(cls, model_dir):
    """Load 'keras_saved_model.h5' and 'processor_state.pkl' from `model_dir`."""
    import tensorflow.keras as keras

    model_path = os.path.join(model_dir, 'keras_saved_model.h5')
    state_path = os.path.join(model_dir, 'processor_state.pkl')

    model = keras.models.load_model(model_path)
    # NOTE: unpickling can execute arbitrary code; only load trusted artifacts.
    with open(state_path, 'rb') as state_file:
      processor = pickle.load(state_file)

    return cls(model, processor)


In [None]:
# Two sample sentences to classify.
test_requests = [
  "I hope everyone's life is blessed with happiness, joy and prosperity",
  "The world is a horrible place and everyone must die",
]

Providing input to the model

In [None]:
from model_prediction import CustomModelPrediction

# Reload the saved model and preprocessor from the working directory and score
# the sample sentences.
classifier = CustomModelPrediction.from_path('.')
results = classifier.predict(test_requests)
print(results)

# For each request, list every class whose score exceeds 0.7.
# NOTE: relies on `label_encoder` having been fitted earlier in the session.
for scores in results:
  print('Predicted labels:')
  for class_name, score in zip(label_encoder.classes_, scores):
    if score > 0.7:
      print(class_name)
  # Emit the separator once per request, not once per class score.
  print('\n')

Using the `CustomModelPrediction` class to make predictions and print the results

In [None]:
# Count rows per class: label 0 is 'hope', label 1 is 'non-hope'.
hope_count = int((data['label'] == 0).sum())
non_hope_count = int((data['label'] == 1).sum())
print("Hope count:", hope_count)
print("Non Hope count:", non_hope_count )

To count the occurrences of different labels in a DataFrame and print the counts

# Under Sampling

In [None]:
# Class sizes before under-sampling (label 0 = hope, label 1 = non-hope).
is_hope = data['label'] == 0
is_non_hope = data['label'] == 1
hope_count = int(is_hope.sum())
non_hope_count = int(is_non_hope.sum())
print("Hope count:", hope_count)
print("Non Hope count:", non_hope_count )

To count the occurrences of different labels in a DataFrame and print the counts

In [None]:
# Keep only the label-0 rows as an independent frame with a fresh 0..n-1 index.
newdata = data[data['label'] == 0].reset_index(drop=True)
newdata.head()

To create a new DataFrame containing only rows with a specific label ('label' = 0) and reset the index

In [None]:
# Keep only the label-1 rows as an independent frame with a fresh 0..n-1 index.
hopeless = data[data['label'] == 1].reset_index(drop=True)
hopeless.head()

To create a new DataFrame containing rows with a specific label ('label' = 1) and reset the index

In [None]:
# Under-sample: keep at most the first 4000 label-1 rows, in their current order.
hopeless = hopeless.iloc[:4000]
hopeless.head()

To limit the number of rows in the DataFrame `hopeless` to 4000 and display the first few rows

In [None]:
# Confirm the row count after truncation.
len(hopeless)

Length of 'hopeless' dataframe is displayed

In [None]:
# Stack the two class subsets under a new 0..n-1 index, then shuffle with a
# fixed seed so the classes are interleaved reproducibly.
newset = shuffle(
  pd.concat([newdata, hopeless], ignore_index=True),
  random_state=22,
)
newset.head()

To concatenate two DataFrames, shuffle the combined DataFrame, and display the first few rows

In [None]:
# Write the under-sampled set to 'shortenedhope.csv' in the current working
# directory, including the index as a column (index=True).
# NOTE(review): this is a relative path; any later step that loads the file
# from Google Drive needs it copied there first -- confirm the intended location.
newset.to_csv('shortenedhope.csv', index=True)
len(newset)

To save the concatenated and shuffled DataFrame as a CSV file (shortenedhope.csv) and display its length


# Newhope

In [None]:
import tensorflow as tf
import pandas as pd
import numpy as np

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.utils import shuffle
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential

In [None]:
# Run Colab's interactive user-authentication flow for this session.
from google.colab import auth
auth.authenticate_user()

In [None]:
# Mount Google Drive at /content/drive; the under-sampled CSV is read from it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the under-sampled dataset from Drive and shuffle it with a fixed seed.
data = shuffle(
  pd.read_csv('/content/drive/MyDrive/shortenedhope.csv'),
  random_state=22,
)

# Preview the first rows.
data.head()

In [None]:
# Copy the 'label' column into a plain Python list and echo it.
labels = list(data['label'].values)
print(labels, '\n')

In [None]:
# Translate numeric labels to names (1 -> 'non-hope', 0 -> 'hope').
# Values that are neither 0 nor 1 produce no entry.
text_labels = [
  'non-hope' if value == 1 else 'hope'
  for value in labels
  if value == 1 or value == 0
]
print(text_labels)

# The label binarizer is fitted on one list per example, so wrap each name.
txt_labels = [[name] for name in text_labels]
print(txt_labels)

In [None]:
# Fit the binarizer on the wrapped text labels and encode them in one step.
label_encoder = MultiLabelBinarizer()
labels_encoded = label_encoder.fit_transform(txt_labels)

# The width of one encoded row is the number of distinct labels.
first_encoded = labels_encoded[0]
num_labels = len(first_encoded)

# Show one example: its text, the class order, and its encoded label.
print(data['text'].values[0])
print(label_encoder.classes_)
print(first_encoded)

In [None]:
# The first 80% of the shuffled rows train the model; the rest are for testing.
train_size = int(0.8 * len(data))
print("Train size: {}".format(train_size))
print("Test size: {}".format(len(data) - train_size))

In [None]:
# Positional split of the encoded labels at `train_size`.
train_labels, test_labels = (
  labels_encoded[:train_size],
  labels_encoded[train_size:],
)

In [None]:
%%writefile preprocess.py

from tensorflow.keras.preprocessing import text

class TextPreprocessor(object):
  """Wraps a Keras `Tokenizer`: fit it once, then encode texts as a matrix."""

  def __init__(self, vocab_size):
    """Record the word cap passed to the tokenizer; no tokenizer exists yet."""
    self._vocab_size = vocab_size
    self._tokenizer = None

  def create_tokenizer(self, text_list):
    """Build a tokenizer limited to `_vocab_size` words and fit it on `text_list`."""
    new_tokenizer = text.Tokenizer(num_words=self._vocab_size)
    new_tokenizer.fit_on_texts(text_list)
    self._tokenizer = new_tokenizer

  def transform_text(self, text_list):
    """Encode `text_list` via the fitted tokenizer's `texts_to_matrix`.

    Call `create_tokenizer` beforehand; the tokenizer is None until then.
    """
    encoded = self._tokenizer.texts_to_matrix(text_list)
    return encoded

In [None]:
from preprocess import TextPreprocessor

VOCAB_SIZE = 50  # This is a hyperparameter, try out different values for your dataset

# Split the raw text at the same position as the labels.
train_qs = data['text'].values[:train_size]
test_qs = data['text'].values[train_size:]

# VOCAB_SIZE is the single source of truth for the encoding width: no
# hard-coded override afterwards, so changing the hyperparameter keeps the
# tokenizer and the model input in sync.
processor = TextPreprocessor(VOCAB_SIZE)
# Fit the tokenizer on the training text only, then encode both splits with it.
processor.create_tokenizer(train_qs)

body_train = processor.transform_text(train_qs)
body_test = processor.transform_text(test_qs)

In [None]:
# Inspect the first encoded training example: its width, then its values.
first_row = body_train[0]
print(len(first_row))
print(first_row)

In [None]:
import pickle

# Serialize the fitted preprocessor so it can be reloaded next to the model.
with open('./processor_state.pkl', 'wb') as state_file:
  pickle.dump(processor, state_file)

In [None]:
def create_model(vocab_size, num_tags):
  """Build and compile a three-layer dense classifier.

  Args:
    vocab_size: Length of each input vector (features per example).
    num_tags: Number of output units, one per label.

  Returns:
    A compiled `tf.keras` Sequential model (binary cross-entropy loss,
    Adam optimizer, accuracy metric).
  """
  model = tf.keras.models.Sequential()
  # Use the arguments rather than the notebook globals VOCAB_SIZE/num_labels,
  # so the model shape follows what the caller passes in.
  model.add(tf.keras.layers.Dense(50, input_shape=(vocab_size,), activation='relu'))
  model.add(tf.keras.layers.Dense(25, activation='relu'))
  model.add(tf.keras.layers.Dense(num_tags, activation='sigmoid'))

  model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
  return model

In [None]:
# Build the network and show its layer summary.
model = create_model(VOCAB_SIZE, num_labels)
model.summary()

# Fit on the training split, then score the held-out test split.
model.fit(body_train, train_labels, epochs=10, batch_size=128, validation_split=0.1)
eval_metrics = model.evaluate(body_test, test_labels, batch_size=128)
print('Eval loss/accuracy:{}'.format(eval_metrics))

# Persist the trained model to disk.
model.save('keras_saved_model.h5')

In [None]:
# Re-run evaluation on the test split; as the last expression in the cell its
# return value is displayed by the notebook.
model.evaluate(body_test, test_labels, batch_size=128)

In [None]:
# Write the model to 'keras_saved_model.h5' (same filename as the save in the
# training cell).
model.save('keras_saved_model.h5')

In [None]:
%%writefile model_prediction.py
import pickle
import os
import numpy as np

class CustomModelPrediction(object):
  """Bundles a model and a text preprocessor behind a single `predict` call."""

  def __init__(self, model, processor):
    """Hold on to the model and preprocessor for later use."""
    self._model = model
    self._processor = processor

  def predict(self, instances, **kwargs):
    """Return the model's outputs for `instances`, converted with `.tolist()`.

    The texts are first encoded by the preprocessor's `transform_text`.
    Extra keyword arguments are accepted and ignored.
    """
    encoded = self._processor.transform_text(instances)
    raw_scores = self._model.predict(encoded)
    return raw_scores.tolist()

  @classmethod
  def from_path(cls, model_dir):
    """Build an instance from the artifacts stored in `model_dir`.

    Expects 'keras_saved_model.h5' and 'processor_state.pkl' in that directory.
    """
    import tensorflow.keras as keras

    model_file = os.path.join(model_dir, 'keras_saved_model.h5')
    processor_file = os.path.join(model_dir, 'processor_state.pkl')

    model = keras.models.load_model(model_file)
    # NOTE: unpickling can execute arbitrary code; only load trusted artifacts.
    with open(processor_file, 'rb') as handle:
      processor = pickle.load(handle)

    return cls(model, processor)


In [None]:
from model_prediction import CustomModelPrediction

# Reload the saved model and preprocessor from the working directory and score
# the sample sentences.
# NOTE: relies on `test_requests` being defined earlier in the session.
classifier = CustomModelPrediction.from_path('.')
results = classifier.predict(test_requests)
print(results)

# For each request, list every class whose score exceeds 0.7.
for scores in results:
  print('Predicted labels:')
  for class_name, score in zip(label_encoder.classes_, scores):
    if score > 0.7:
      print(class_name)
  # Emit the separator once per request, not once per class score.
  print('\n')

In [None]:
# Class sizes in the under-sampled set (label 0 = hope, label 1 = non-hope).
hope_count = int((data['label'] == 0).sum())
non_hope_count = int((data['label'] == 1).sum())
print("Hope count:", hope_count)
print("Non Hope count:", non_hope_count )