# Part 1: Use the same train, validation, and test datasets as the previous model

## 1. Upload train, val, test datasets

In [1]:
from google.colab import files

# Upload files
uploaded = files.upload()

Saving test_data.csv to test_data.csv
Saving train_data.csv to train_data.csv
Saving validation_data.csv to validation_data.csv


In [2]:
# check files
!ls

sample_data  test_data.csv  train_data.csv  validation_data.csv


In [3]:
#check train_data.csv
!head -n 5 'train_data.csv'

Text|Label
perfect living literature competition well made authentic appearance included dress separate apron headband bow|Positive
terrible clearly made someone basement|Negative
seemed like silly thing buy partner solar watch problem keeping prior solar watch charged weird watch never problem think bleeding charge like wear watch time unless showering maybe issue battery charge know case got little led charger basically bright flashlight stable setting delighted really flashlight would difficult keep watch top upended flashlight design work well two little silicone cup provided watch different size small worked well far|Positive
used belt daily excited belt clean look buckle hole would tear unfortunately month use buckle broke took good care belt otherwise great condition pin hold ratchet place simply fell going buy color look like ill moving something else|Negative


## 2. Make data folder structure

In [12]:
import pandas as pd
import os

# Load the three pipe-delimited split files uploaded above
train_data = pd.read_csv('train_data.csv', delimiter='|')
validation_data = pd.read_csv('validation_data.csv', delimiter='|')
test_data = pd.read_csv('test_data.csv', delimiter='|')

# Root folder of each split
train_folder_path = '/content/bert_same_data/train'
validation_folder_path = '/content/bert_same_data/validation'
test_folder_path = '/content/bert_same_data/test'
split_folder_paths = [train_folder_path, validation_folder_path, test_folder_path]

# Create the split folders first (exist_ok makes re-running the cell harmless)
for split_folder in split_folder_paths:
    os.makedirs(split_folder, exist_ok=True)

# Then add one class_<id> sub-folder per label id inside every split folder
labels = ['2', '0', '1']
for label in labels:
    for split_folder in split_folder_paths:
        os.makedirs(os.path.join(split_folder, f'class_{label}'), exist_ok=True)

# Function to save text to files
label_mapping = {'Positive': 2, 'Neutral': 0, 'Negative': 1}
def save_text_to_files(data, folder_path):
    """Write every row of ``data`` to its own text file, grouped by class.

    The ``Text`` value of a row is written to
    ``<folder_path>/class_<n>/<index>.txt``, where ``n`` is the value that the
    module-level ``label_mapping`` assigns to the row's ``Label`` and
    ``index`` is the row's index label in ``data``.

    Args:
        data: DataFrame with a ``Text`` column and a ``Label`` column.
        folder_path: Root folder of one split; the ``class_<n>`` sub-folders
            are created if they do not exist yet.

    Raises:
        KeyError: If a ``Label`` value is not a key of ``label_mapping``.
    """
    # Make sure every class folder exists so the function also works on a fresh root.
    for numerical_label in label_mapping.values():
        os.makedirs(os.path.join(folder_path, f'class_{numerical_label}'), exist_ok=True)

    # Walk the columns directly instead of building a Series per row with iterrows().
    for index, text, label in zip(data.index, data['Text'], data['Label']):
        # Map the text label to a numerical label
        numerical_label = label_mapping[label]
        # Construct the file path
        file_path = os.path.join(folder_path, f'class_{numerical_label}', f'{index}.txt')
        # An empty text is read back from CSV as NaN; write an empty file
        # rather than the literal word "nan".
        content = '' if pd.isna(text) else str(text)
        # Explicit encoding so the files do not depend on the machine's locale.
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(content)

# Save texts to files: one <index>.txt per row, placed in the class_<n> sub-folder
# of the matching split folder (see save_text_to_files in this cell)
save_text_to_files(train_data, train_folder_path)
save_text_to_files(validation_data, validation_folder_path)
save_text_to_files(test_data, test_folder_path)

In [27]:
# check folder structure
!ls /content/bert_same_data
!ls /content/bert_same_data/train
!ls /content/bert_same_data/train/class_0 | head -5
!head -n 5 /content/bert_same_data/train/class_2/0.txt

test  train  validation
class_0  class_1  class_2
10007.txt
10008.txt
1001.txt
10021.txt
10032.txt
perfect living literature competition well made authentic appearance included dress separate apron headband bow

## 3. Download folders



In [None]:
!zip -r /content/bert_same_data.zip /content/bert_same_data

In [34]:
files.download('/content/bert_same_data.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## 4. Uploaded to Box

link:

 https://tufts.box.com/shared/static/9tcv9p4zc1q9d5fb80weqtvm2los1len.zip

# Part 2: Re-preprocess text data


In [None]:
# upload cleaned_data.csv (but use uncleaned reviewText column)
files.upload()

In [36]:
# give this notebook access to drive files
from google.colab import drive
drive.mount('/content/drive')

# NOTE: `pd` is not imported in this cell; it relies on the earlier
# `import pandas as pd` cell having been run in the same kernel.
path = '/content/drive/My Drive/cleaned_data.csv'
data = pd.read_csv(path)
# Preview 5 random rows (no random_state is passed, so the rows differ between runs)
data.sample(5)

Mounted at /content/drive


Unnamed: 0,overall,reviewTime,asin,reviewText,summary,title,brand,rank,category,cleaned_reviewText
13616,4.0,2018-06-14,B00US1F47K,"Loved the print, and fabric, they just were to...",I reordered in correct size.,i play. Baby Reusable Absorbent Swim Diapers 2...,i play.,20680.0,"Clothing,ShoesJewelry",loved print fabric big
10705,5.0,2018-06-13,B00NIVAEG8,Nice hat,Five Stars,O'Neill Men's Sonoma Prints Straw Hat,,28053.0,"Clothing,ShoesJewelry",nice hat
7647,5.0,2018-04-15,B00G8Q7JZ4,"I've had tungsten carbide rings for years, bec...",White Tungsten is the way to go.,MJ Metals Jewelry 2mm to 10mm White Tungsten C...,MJ Metals Jewelry,8871.0,"Clothing,ShoesJewelry",ive tungsten carbide ring year like dull silve...
3705,3.0,2018-01-09,B000YFSR4W,No pockets,Three Stars,Hanes Mens EcoSmart Fleece Sweatpant,,13395.0,"Clothing,ShoesJewelry",pocket
31498,4.0,2018-04-29,B00008JVTT,Ive been looking for shirts like these for a w...,Ive been looking for shirts like these for a w...,Paul Fredrick Men's Pinpoint Snap Tab Collar F...,,4785781.0,"Clothing,ShoesJewelry",ive looking shirt like found im buying future ...


In [37]:
def assign_sentiment_class(rating):
    """Map a numeric star rating to a sentiment label.

    Args:
        rating: Numeric rating (the notebook applies this to ``data['overall']``).

    Returns:
        ``'Negative'`` for ratings <= 2, ``'Positive'`` for ratings >= 4 and
        ``'Neutral'`` for anything strictly in between. For whole-star
        ratings this is the same mapping as before (1-2 / 3 / 4-5).

    Raises:
        ValueError: If ``rating`` is missing (``None`` or NaN). Every
            comparison with NaN is False, so a missing rating would otherwise
            fall through to the last branch and be labelled 'Positive'.
    """
    # NaN is the only value that is not equal to itself.
    if rating is None or rating != rating:
        raise ValueError('rating is missing; cannot assign a sentiment class')
    if rating <= 2.0:
        return 'Negative'
    if rating >= 4.0:
        return 'Positive'
    # Covers 3.0 and any fractional rating between 2 and 4.
    return 'Neutral'

data['sentiment_class'] = data['overall'].apply(assign_sentiment_class)
data.sample(5)

Unnamed: 0,overall,reviewTime,asin,reviewText,summary,title,brand,rank,category,cleaned_reviewText,sentiment_class
43011,4.0,2018-01-07,B011W0FI58,The tiara was beautiful and as heavy as i expe...,Loved it! Just didn't get the one in the picture.,Disney Store Princess Cinderella Costume Tiara,,1530016.0,"Clothing,ShoesJewelry",tiara beautiful heavy expected however tiara o...,Positive
3306,5.0,2018-01-04,B000YFSR4W,Great considering the price of $7!,Buy a size smaller than you think you need.,Hanes Mens EcoSmart Fleece Sweatpant,,13395.0,"Clothing,ShoesJewelry",great considering price !,Positive
39523,5.0,2018-06-05,B00T7MYZ9O,I recommend it,Looks really nice on - flattering and comfortable,S.L. Fashions Women's Solid Chiffon Halter Dress,S.L. Fashions,331362.0,"Clothing,ShoesJewelry",recommend,Positive
29420,3.0,2018-03-25,B01FQ114LG,Fianc likes how it fits but the straps were se...,Fits well...Sewn incorrectly.,Queenie Ke Women's Light Support Double-T Back...,,8402.0,"Clothing,ShoesJewelry",fianc like fit strap sewn twisted one side str...,Neutral
12152,5.0,2018-05-18,B00RLSCLJM,Great product and service. Just what I was lo...,Five Stars,MJ Metals Jewelry 2mm to 10mm White Tungsten C...,MJ Metals Jewelry,6926.0,"Clothing,ShoesJewelry",great product service looking wedding band rep...,Positive


## 1.1 Basic text cleaning - when running, can skip this part

1. Use the original reviewText column from cleaned_data.csv
2. Create a new basic_cleaned_reviewText column
3. Create a label column, as in the previous notebook

In [42]:


import re

In [49]:
## Need some basic cleaning, such as removing HTML tags, but stopwords, punctuation, etc. must be retained. Spell checking was not done this time.

# rewrite text clean function and re_clean data

# Function to clean text
def clean_review_text(text):
    '''Clean one raw review for BERT model training.

    Stopwords and punctuation are kept; only markup, links, media file names,
    letter case and non-ASCII characters are touched.

    Args:
        text: The raw review. ``None`` and NaN are treated as an empty review;
            any other non-string value is converted with ``str()``.

    Returns:
        The lowercased, ASCII-only review text with HTML tags, URLs and
        media file names removed.
    '''
    import unicodedata  # local import so the function does not depend on another cell

    # A missing review becomes an empty string instead of crashing re.sub.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    # Lowercase first so the URL and extension patterns below also catch
    # "HTTP://..." and ".JPG".
    text = str(text).lower()
    # Remove HTML tags
    text = re.sub(r'<[^>]+>', '', text)
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'www\.\S+', '', text)
    # Remove file paths and file extensions
    text = re.sub(r'\S+\.(jpg|png|gif|jpeg|mp4|avi)', '', text)
    # Decompose accented characters so the base letter survives the ASCII
    # filter ("café" -> "cafe" instead of "caf"); whatever still has no
    # ASCII form is dropped.
    text = unicodedata.normalize('NFKD', text)
    text = text.encode('ascii', 'ignore').decode()
    return text


In [50]:
# Check cleaning result on a small sample
sample = data['reviewText'].sample(10)
sample.apply(clean_review_text)

33525    this suit fits very well and maintains its sha...
31709                                           very good.
36011    second kilt from utkilts. very pleased. fits p...
24846    genuine leather. the pocket to slide the check...
20972                                              love it
56878    it is in fact a pin with two beautiful flags i...
43641          love it. i use it every day. great quality.
9127     item fit great! very comfortable, i am 5'7" an...
23121    got this for my mom what great nicely made han...
39371    i used this for a fursuit head, actually. it's...
Name: reviewText, dtype: object

In [51]:
data['basic_cleaned_reviewText'] = data['reviewText'].apply(clean_review_text)
data.sample(5)

Unnamed: 0,overall,reviewTime,asin,reviewText,summary,title,brand,rank,category,cleaned_reviewText,sentiment_class,basic_cleaned_reviewText
14061,5.0,2018-03-11,B00W3XUB88,Attractive and sturdy. Better than I hoped.,Five Stars,Reading Glasses for Women and Men - Best 4 Pac...,TruVision Readers,13661.0,"Clothing,ShoesJewelry",attractive sturdy better hoped,Positive,attractive and sturdy. better than i hoped.
35546,1.0,2018-07-16,B00ICFBVLW,Cheap and took forever to arrive. Total disapp...,Cheap. Turns piercing holes green.,amtonseeshop Specialized Brand new 40PCS Fashi...,amtonseeshop,2238905.0,"Clothing,ShoesJewelry",cheap took forever arrive total disappointment...,Negative,cheap and took forever to arrive. total disapp...
28993,2.0,2018-04-18,B01FKVMSF8,"This is dress was very cute, however the bust ...",Not for large busted ladies,Beachcoco Women's Maternity Printed Light Weig...,Beachcoco,5798473.0,"Clothing,ShoesJewelry",dress cute however bust come near right size i...,Negative,"this is dress was very cute, however the bust ..."
24792,5.0,2018-02-11,B01AY789DI,"Cute, adorable and great for the price. Bough...",Dainty and pretty,JewelryPalace Diamond Birthstone Promise Ring ...,JewelryPalace,594628.0,"Clothing,ShoesJewelry",cute adorable great price bought daughter abso...,Positive,"cute, adorable and great for the price. bough..."
80,5.0,2018-05-07,B000EVEAS8,"At 5' 4 "" and 120 lbs., the small was perfect....",the small was perfect. There was plenty of roo...,"Plush Microfiber Robe - Soft, Warm, and Lightw...",Plush Necessities,43313.0,"Clothing,Shoesamp;Jewelry",la small perfect plenty room still feel totall...,Positive,"at 5' 4 "" and 120 lbs., the small was perfect...."


In [52]:
data.to_csv('/content/drive/My Drive/basic_cleaned_data.csv', index=False)

In [53]:
# Download the file
files.download('/content/drive/My Drive/basic_cleaned_data.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [61]:
# train, validation, test data split --- use data['sentiment_class'] as y label
from sklearn.model_selection import train_test_split

# Target share of the full dataset for each split
train_ratio = 0.75
validation_ratio = 0.15
test_ratio = 0.10


# train: 75%, validation: 15%, test: 10%
# Step 1: hold out 1 - train_ratio (25%) of the rows as a temporary pool,
# stored for now in x_test / y_test.
# NOTE(review): no stratify= is passed, so the label proportions of each split
# are left to chance -- confirm this is intended.
x_train, x_test, y_train, y_test = train_test_split(data['basic_cleaned_reviewText'], data['sentiment_class'], test_size=1 - train_ratio, random_state=42)
# Step 2: split that pool again. test_ratio/(test_ratio + validation_ratio) = 0.4
# of the pool (10% of all rows) becomes the final test set and the remaining
# 0.6 (15% of all rows) the validation set. x_test / y_test are rebound here.
x_val, x_test, y_val, y_test = train_test_split(x_test, y_test, test_size=test_ratio/(test_ratio + validation_ratio), random_state=42)


In [62]:
# Export the train / validation / test splits for later BERT use, one "Text|Label" CSV per split

# Wrap each split in a two-column frame
basic_train_df = pd.DataFrame({'Text': x_train, 'Label': y_train})
basic_val_df = pd.DataFrame({'Text': x_val, 'Label': y_val})
basic_test_df = pd.DataFrame({'Text': x_test, 'Label': y_test})

# Pair every frame with its output file, then write them all with '|' as separator
basic_exports = [
    (basic_train_df, 'basic_train_data.csv'),
    (basic_val_df, 'basic_validation_data.csv'),
    (basic_test_df, 'basic_test_data.csv'),
]
for split_df, csv_name in basic_exports:
    split_df.to_csv(csv_name, sep='|', index=False)

# Read the training file back as a quick sanity check of the format
train_preview = pd.read_csv('basic_train_data.csv', sep='|').head()
print(train_preview)

# Send the three files to the browser
for _, csv_name in basic_exports:
    files.download(csv_name)

                                                Text     Label
0  it is a gorgeous watch. my daughter loved it. ...  Positive
1                            perfect\n gift for teen  Positive
2  right size for current wallet and with a place...  Positive
3  what i liked the most was the laces held their...  Positive
4  i loved my toe ring i could have sworn i order...  Positive


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## Now, repeat the same process as part1

In [63]:
# Upload files
uploaded = files.upload()

Saving basic_test_data.csv to basic_test_data (1).csv
Saving basic_train_data.csv to basic_train_data (1).csv
Saving basic_validation_data.csv to basic_validation_data (1).csv


In [64]:
# check files
!ls

'basic_test_data (1).csv'	  basic_validation_data.csv   drive
 basic_test_data.csv		  bert			      sample_data
'basic_train_data (1).csv'	  bert_same_data	      test_data.csv
 basic_train_data.csv		  bert_same_data.zip	      train_data.csv
'basic_validation_data (1).csv'   cleaned_data.csv	      validation_data.csv


In [65]:
# check basic_train_data.csv
!head -n 5 'basic_train_data.csv'

Text|Label
it is a gorgeous watch. my daughter loved it. beautiful design and great quality for children's watch. she learned how to tell the time within 2 hours of getting it. we are very pleased with it and we will definitely but our son's watch from the same company.|Positive
"perfect
 gift for teen"|Positive
right size for current wallet and with a place for 20 cards.  good value for low cost.|Positive


## 2. Make data folder structure

In [66]:
import pandas as pd
import os

# Load the three pipe-delimited "basic cleaned" split files
basic_train_data = pd.read_csv('basic_train_data.csv', delimiter='|')
basic_validation_data = pd.read_csv('basic_validation_data.csv', delimiter='|')
basic_test_data = pd.read_csv('basic_test_data.csv', delimiter='|')

# Root folder of each split
basic_train_folder_path = '/content/bert_basic_data/train'
basic_validation_folder_path = '/content/bert_basic_data/validation'
basic_test_folder_path = '/content/bert_basic_data/test'
basic_split_folder_paths = [
    basic_train_folder_path,
    basic_validation_folder_path,
    basic_test_folder_path,
]

# Create the split folders first (exist_ok makes re-running the cell harmless)
for split_folder in basic_split_folder_paths:
    os.makedirs(split_folder, exist_ok=True)

# Then add one class_<id> sub-folder per label id inside every split folder
labels = ['2', '0', '1']
for label in labels:
    for split_folder in basic_split_folder_paths:
        os.makedirs(os.path.join(split_folder, f'class_{label}'), exist_ok=True)

# Function to save text to files
label_mapping = {'Positive': 2, 'Neutral': 0, 'Negative': 1}
def save_text_to_files(data, folder_path):
    """Write every row of ``data`` to its own text file, grouped by class.

    The ``Text`` value of a row is written to
    ``<folder_path>/class_<n>/<index>.txt``, where ``n`` is the value that the
    module-level ``label_mapping`` assigns to the row's ``Label`` and
    ``index`` is the row's index label in ``data``.

    Args:
        data: DataFrame with a ``Text`` column and a ``Label`` column.
        folder_path: Root folder of one split; the ``class_<n>`` sub-folders
            are created if they do not exist yet.

    Raises:
        KeyError: If a ``Label`` value is not a key of ``label_mapping``.
    """
    # Make sure every class folder exists so the function also works on a fresh root.
    for numerical_label in label_mapping.values():
        os.makedirs(os.path.join(folder_path, f'class_{numerical_label}'), exist_ok=True)

    # Walk the columns directly instead of building a Series per row with iterrows().
    for index, text, label in zip(data.index, data['Text'], data['Label']):
        # Map the text label to a numerical label
        numerical_label = label_mapping[label]
        # Construct the file path
        file_path = os.path.join(folder_path, f'class_{numerical_label}', f'{index}.txt')
        # A review that is empty after cleaning is read back from CSV as NaN;
        # write an empty file rather than the literal word "nan".
        content = '' if pd.isna(text) else str(text)
        # Explicit encoding so the files do not depend on the machine's locale.
        with open(file_path, 'w', encoding='utf-8') as file:
            file.write(content)

# Save texts to files: one <index>.txt per row, placed in the class_<n> sub-folder
# of the matching split folder (see save_text_to_files in this cell)
save_text_to_files(basic_train_data, basic_train_folder_path)
save_text_to_files(basic_validation_data, basic_validation_folder_path)
save_text_to_files(basic_test_data, basic_test_folder_path)

In [67]:
# check folder structure
!ls /content/bert_basic_data
!ls /content/bert_basic_data/train
!ls /content/bert_basic_data/train/class_0 | head -5
!head -n 5 /content/bert_basic_data/train/class_2/0.txt

test  train  validation
class_0  class_1  class_2
10018.txt
10022.txt
10040.txt
10044.txt
10046.txt
it is a gorgeous watch. my daughter loved it. beautiful design and great quality for children's watch. she learned how to tell the time within 2 hours of getting it. we are very pleased with it and we will definitely but our son's watch from the same company.

## 3. Download folders



In [None]:
!zip -r /content/bert_basic_data.zip /content/bert_basic_data

In [69]:
files.download('/content/bert_basic_data.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

## 4. Uploaded to Box

link:

https://tufts.box.com/shared/static/65lzdmpyito0c98gt89nktjrnuvrsjdm.zip