# Basic Analysis & Traditional NN
##### This notebook contains code for analysing and classifying unstructured data using traditional Neural Network and Machine Learning methods, specifically RNN, LSTM and GRU, with tweaks in hyperparameters and plots showing the results.

In [1]:
# imports
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import Dataset, DataLoader

In [2]:
# First get an idea of how the entire .csv looks like
df = pd.read_csv("data/conv_history_2chatbot.csv", header=0)
first_last = pd.concat([df.head(), df.tail()])

print("Overview of the dataset:")
print(first_last.to_string())
print(df.shape)

# Check for missing values
missing_values = df.isnull().sum()
print("\nMissing values:")
print(missing_values)

Overview of the dataset:
       message_id  conversation_id                         timestamp  chatbot_id          Course                               hash_id     sender                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              

In [3]:
# Check how many unique values are in the 'Course' column
unique_courses = df['Course'].nunique()
print(f"\nNumber of unique courses: {unique_courses}")

# Name of courses
course_names = df['Course'].unique()
print("\nUnique course names:")
print(course_names)

# Find how many instances of "image input" there are in the "text" column
image_input_count = df[df['text'].str.contains('image input')].shape[0]
print(f"\nNumber of instances of 'image input' in the 'text' column: {image_input_count}")


Number of unique courses: 2

Unique course names:
['My Math Mentor' 'AB1202']

Number of instances of 'image input' in the 'text' column: 954


Note: Llama-2 may not run locally due to RAM requirements. Code has been excluded.

### Llama 2 sample output

In [15]:
df_sample200 = pd.read_csv("remote_outputs/labeled_head200.csv")
first_last = pd.concat([df.head(), df.tail()])

print("Overview of the dataset:")
print(first_last.to_string())
print(df.shape)

Overview of the dataset:
       message_id  conversation_id                         timestamp  chatbot_id          Course                               hash_id     sender                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              

In [16]:
for label in df_sample200["bloom_label"].unique():
    print(f"Label: {label}")
    print(df_sample200[df_sample200["bloom_label"] == label]["text"])
    print("\n")


Label: Understanding
0                  differentiate sin2y with respect to x
4                           What is De Moivre's Theorem?
5      De Moivre's Theorem is a fundamental result in...
8      why do we rationalise both numerator and denom...
9                differentiate sin(2y) with respect to x
                             ...                        
188    The small-angle approximation is a useful tool...
192     How to find distance between a point and a line?
193                                       what is sgn(x)
198      How do I know if I have a firm grasp on limits?
199    Understanding limits is a fundamental part of ...
Name: text, Length: 64, dtype: object


Label: Analyzing
1       Determine the following limit\nlimx→9x3−93x−−√−3
2                         how do we know the value of k 
3      To differentiate \( \sin^2(y) \) with respect ...
7      no like how do we know the value of k for argu...
10     To differentiate \( \sin(2y) \) with respect t...
          