# Note:
- This notebook file may contain methods or algorithms that are NOT covered by the teaching content of BT4222 and hence will not be assessed in your midterm exam.
- It serves to broaden your exposure, in both depth and breadth, to practical methods for addressing the specific project topic. We believe it will be helpful for your current project and for your future internship endeavors.

# **Import Library**

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import nltk
from nltk.corpus import stopwords
import string
import re
import seaborn as sns
from tqdm import tqdm
import matplotlib.pyplot as plt
from torch.utils.data import TensorDataset, DataLoader
from nltk.tokenize import word_tokenize
import gensim
from gensim.models import Word2Vec
from gensim.models.keyedvectors import KeyedVectors
from nltk.tokenize import word_tokenize
import pandas as pd
import numpy as np

# Fetch the NLTK resources this notebook relies on: the tokenizer models used by
# word_tokenize and the English stop-word list.
# Newer NLTK releases (3.9+) load the tokenizer from 'punkt_tab' rather than the
# pickled 'punkt' package, so both are requested to work on old and new versions.
nltk.download('punkt')
nltk.download('punkt_tab')
# Kept last so the value displayed by the cell is unchanged.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

# **Word Preprocessing**
Here, we show how the first sentence changes at each step, as an example:

Origin: I wanted some pizza...

After removing the stop words: wanted pizza looking outside ...

After tokenizing the text: ['wanted', 'pizza', 'looking', 'outside', 'box', 'found', 'local', 'pizzeria'....]




In [None]:
####################################
import gdown

# Download the two CSV splits from Google Drive, one (file id, local name) pair
# per iteration.
# train_dataset.csv and test_dataset.csv are subsets of the Yelp dataset https://www.yelp.com/dataset
for file_id, output in (
    ('1pvDQhnGgeLWjqkHFxc3G0rkGpv6T3VN3', 'train_dataset.csv'),
    ('1LtTRhyhYAcXY5Yh_p8TdR_4LOvLguvZG', 'test_dataset.csv'),
):
    url = f'https://drive.google.com/uc?id={file_id}'
    gdown.download(url, output, quiet=False)

# Read both splits into DataFrames and preview the raw training rows.
train_data = pd.read_csv('train_dataset.csv')
test_data = pd.read_csv('test_dataset.csv')
print("First 5 sentences and their label",train_data.head())

# English stop words, stored as a set so membership checks are fast.
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalise one review into a space-separated string of content words.

    Steps, in order: lowercase the text; replace every non-word character
    (anything that is not a letter, digit or underscore) with a space; delete
    digits; tokenize with ``word_tokenize``; drop every token that appears in
    the module-level ``stop_words`` set.

    Args:
        text: The raw review text. Any non-string value (for example ``None``
            or a float NaN standing in for a missing field) is treated as
            empty text instead of raising.

    Returns:
        The surviving tokens joined by single spaces, or ``''`` when nothing
        survives or the input was not a string.
    """
    # A missing review would otherwise crash on .lower(); map it to empty text.
    if not isinstance(text, str):
        return ''

    text = text.lower()

    # Non-word characters become a space so neighbouring words stay separated...
    text = re.sub(r'\W', ' ', text)
    # ...whereas digits are removed outright.
    text = re.sub(r'\d', '', text)

    # Split the text into individual words, then filter out the stop words.
    words = word_tokenize(text)
    return ' '.join(word for word in words if word not in stop_words)

# Run clean_text over every review of both splits, overwriting the 'text' column
train_data['text'] = train_data['text'].apply(clean_text)
print("First 5 sentences and their label after cleaning",train_data.head())
test_data['text'] = test_data['text'].apply(clean_text)

# Break each cleaned review into its tokens: one list of words per review
train_sentences = [word_tokenize(review) for review in train_data['text']]
print("The first sentence", train_sentences[0])

test_sentences = [word_tokenize(review) for review in test_data['text']]


Downloading...
From: https://drive.google.com/uc?id=1pvDQhnGgeLWjqkHFxc3G0rkGpv6T3VN3
To: /content/train_dataset.csv
100%|██████████| 256M/256M [00:05<00:00, 50.8MB/s]
Downloading...
From: https://drive.google.com/uc?id=1LtTRhyhYAcXY5Yh_p8TdR_4LOvLguvZG
To: /content/test_dataset.csv
100%|██████████| 63.8M/63.8M [00:02<00:00, 29.7MB/s]


First 5 sentences and their label                                                 text  stars
0  I wanted some pizza...but I was looking outsid...    3.0
1  Buy 3 get one free bang bang shrimp tacos - if...    5.0
2  Here on a business trip, ordered pizza. \n\nTh...    1.0
3  It's ok got the pho ga. The  broth was kinda o...    3.0
4  Stopped by during lunch time and it was obviou...    4.0
First 5 sentences and their label after cleaning                                                 text  stars
0  wanted pizza looking outside box found local p...    3.0
1  buy get one free bang bang shrimp tacos ya kno...    5.0
2  business trip ordered pizza said would minutes...    1.0
3  ok got pho ga broth kinda light side flavorful...    3.0
4  stopped lunch time obviously packed wanted try...    4.0
The first sentence ['wanted', 'pizza', 'looking', 'outside', 'box', 'found', 'local', 'pizzeria', 'well', 'love', 'supporting', 'places', 'like', 'buffet', 'night', 'really', 'wanted', 'call', 'ahe

# **Word2Vec embedding**
Here, we show how to embed a sentence as a vector that can be fed to different kinds of models, such as a CNN or an LSTM.

In [None]:



# Switch: True fits a new Word2Vec model on train_sentences,
# False downloads vectors that were trained beforehand.
train=False

if not train:
    # use the already-trained vectors: download the text-format file and load it
    file_id = '1iIZEpngke06CvLXGjyuO2XYn9mOiEZ1M'
    output = 'model_in_vector_format.txt'
    url = f'https://drive.google.com/uc?id={file_id}'
    gdown.download(url, output, quiet=False)
    wv = KeyedVectors.load_word2vec_format('model_in_vector_format.txt', binary=False)
else:
    # fit Word2Vec on the tokenized training reviews, save the model to disk,
    # and keep its word vectors for the lookups below
    model = Word2Vec(sentences=train_sentences, vector_size=50, window=5, min_count=1, workers=4)
    model.save("word2vec_model_all_yours.bin")
    wv = model.wv


def get_sentence_vectors(sentences, keyed_vectors=None, vector_size=None):
    """Embed each tokenized sentence as the mean of its word vectors.

    Args:
        sentences: Iterable of sentences, each an iterable of word tokens.
        keyed_vectors: Word-to-vector lookup supporting ``word in kv`` and
            ``kv[word]``. Defaults to the module-level ``wv``.
        vector_size: Length of the all-zero vector used for a sentence with no
            known words. Defaults to ``keyed_vectors.vector_size`` when that
            attribute exists, otherwise 50 (the size used when training above).

    Returns:
        A list with one 1-D numpy array per sentence: the element-wise mean of
        the vectors of the in-vocabulary words, or a float32 zero vector when
        the sentence contains none.
    """
    kv = wv if keyed_vectors is None else keyed_vectors
    if vector_size is None:
        vector_size = getattr(kv, 'vector_size', 50)

    vectors = []
    for sentence in sentences:
        # Keep only the words the embedding model knows about.
        word_vectors = [kv[word] for word in sentence if word in kv]
        if word_vectors:
            # The sentence vector is the average of its word vectors.
            vectors.append(np.mean(word_vectors, axis=0))
        else:
            # No known words: fall back to zeros, as an ndarray so every row
            # has the same type and the list stacks cleanly into one array.
            vectors.append(np.zeros(vector_size, dtype=np.float32))
    return vectors

# Embed every tokenized review of both splits, then show one example embedding
train_vectors = get_sentence_vectors(train_sentences)
test_vectors = get_sentence_vectors(test_sentences)

print("vector of the first training sentence:", train_vectors[0])

Downloading...
From: https://drive.google.com/uc?id=1iIZEpngke06CvLXGjyuO2XYn9mOiEZ1M
To: /content/model_in_vector_format.txt
100%|██████████| 82.5M/82.5M [00:04<00:00, 20.5MB/s]


vector of the first training sentence: [ 0.18806686 -0.81407547  0.2731794  -0.19380964 -0.9803709   1.2326044
 -0.31043512 -1.4522214   1.2036858  -0.2991813   0.52706    -0.6913312
  0.01106239  0.10594404  0.33188528  0.14175086 -0.48426813  0.86514705
  1.0458082   0.87707883 -0.09251958  0.15961069 -1.7640945   0.68583244
 -0.61002684  0.10527007  0.8128444   0.09110551  0.4896992  -0.651426
 -0.2042806   0.43941516  0.20847556  0.12768184  0.517038   -0.7411599
 -0.21149275 -0.11730701  0.19832464  1.0790299  -1.0857916   0.43405077
 -0.42623082 -0.68324167 -1.0194739   0.9581027   0.16632354 -0.33166024
  0.06498344 -0.72173446]


# **Creating training and testing dataset**

In [None]:
# Turn the sentence vectors into tensors.
# The list is first stacked into a single float32 ndarray: building a tensor
# directly from a Python list of ndarrays is very slow and makes torch emit a
# UserWarning.
train_vectors = torch.tensor(np.asarray(train_vectors, dtype=np.float32))
test_vectors = torch.tensor(np.asarray(test_vectors, dtype=np.float32))
# Reshape to (num_samples, 1, embedding_dim): -1 lets torch infer the number of
# samples, 1 is the single input channel, and the embedding size is read from
# the data instead of being hard-coded.
train_vectors = train_vectors.reshape(-1, 1, train_vectors.shape[-1])
test_vectors = test_vectors.reshape(-1, 1, test_vectors.shape[-1])
print("shape of training data", train_vectors.shape, "shape of testing data",test_vectors.shape)
# get the labels from the 'stars' column of each split
train_labels = torch.tensor(train_data['stars'].values)
test_labels = torch.tensor(test_data['stars'].values)
# save the training dataset as 'train_vectors_yours.pt' and 'train_labels_yours.pt'
torch.save(train_vectors, 'train_vectors_yours.pt')
torch.save(train_labels, 'train_labels_yours.pt')
# save the testing dataset as 'test_vectors_yours.pt' and 'test_labels_yours.pt'
torch.save(test_vectors, 'test_vectors_yours.pt')
torch.save(test_labels, 'test_labels_yours.pt')

  train_vectors = torch.tensor(train_vectors)


shape of training data torch.Size([400000, 1, 50]) shape of testing data torch.Size([100000, 1, 50])
