In [8]:
import matplotlib.pyplot as plt
import pandas as pd
import torch
import numpy as np

from torchtext.data import Field, TabularDataset, BucketIterator, Iterator

import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification

import torch.optim as optim

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
from utils import load_metrics

## Preprocess

In [4]:
# Directory prefix shared by every input and generated file of this notebook.
des_path = 'laptop/'

# Raw needs table, the flattened CSV derived from it, and the review CSV.
needs_file_path = f'{des_path}needs_byasin.csv'
needs_preprocessed_path = f'{des_path}needs.csv'
review_file_path = f'{des_path}amazon_reviews.csv'

# Novel-needs workbook and the CSV it is exported to.
novel_file_path = f'{des_path}novel_needs.xlsx'
novel_needs_csv_path = f'{des_path}novel_needs.csv'

In [4]:
# Share of each class passed as train_size to the train/test split
# (the remainder becomes the test set).
train_test_ratio = 0.9
# Share of the non-test rows passed as train_size to the train/valid split
# (the remainder becomes the validation set).
train_valid_ratio = 0.8
# Upper bound on the number of whitespace-separated words trim_string keeps.
first_n_words = 200

In [5]:
def trim_string(x, n_words=None):
    """Keep at most the first ``n_words`` whitespace-separated words of ``x``.

    Args:
        x: Text to shorten (a ``str``).
        n_words: Maximum number of words to keep. When omitted, the
            notebook-level ``first_n_words`` setting is looked up at call
            time, so ``Series.apply(trim_string)`` keeps working unchanged.

    Returns:
        The kept words joined by single spaces. Leading/trailing whitespace
        is dropped and any run of whitespace between kept words becomes one
        space.
    """
    if n_words is None:
        n_words = first_n_words

    # maxsplit bounds the work: everything after the n_words-th word stays in
    # a single trailing chunk, which the slice below discards.
    words = x.split(maxsplit=n_words)
    return ' '.join(words[:n_words])

In [6]:
# Load the raw needs table; the first CSV column becomes the index.
needs = pd.read_csv(needs_file_path, index_col=0)

# Flatten the table column by column into one Series of strings.
# astype(str) turns missing cells into the literal 'nan', which is dropped.
# The per-column pieces are collected in a list and concatenated once, rather
# than re-concatenating (and re-copying) the growing Series on every iteration.
need_columns = []
for index in range(needs.shape[1]):
    temp = needs.iloc[:, index].astype(str)
    need_columns.append(temp[temp != 'nan'])

data_needs = pd.concat(need_columns).reset_index(drop=True)
data_needs.to_csv(needs_preprocessed_path, index=False)


def _label_and_trim(frame, label):
    """Return a copy of ``frame`` with a constant ``label`` column added, the
    '0' column shortened by ``trim_string`` and then renamed to 'text'."""
    return (
        frame
        .assign(**{'label': label, '0': frame['0'].apply(trim_string)})
        .rename(columns={'0': 'text'})
    )


# Re-read the flattened needs from disk (the text is accessed under the
# column header '0'); every need gets label 1.
data_needs = _label_and_trim(pd.read_csv(needs_preprocessed_path), label=1)

# The review CSV is read the same way, from its '0' column; every review
# gets label 0.
data_review = _label_and_trim(pd.read_csv(review_file_path), label=0)


In [7]:
# Train - Test: needs and reviews are split separately, with the same
# ratio and the same fixed seed.
test_split_options = {'train_size': train_test_ratio, 'random_state': 1}
df_need_full_train, df_need_test = train_test_split(data_needs, **test_split_options)
df_review_full_train, df_review_test = train_test_split(data_review, **test_split_options)

In [8]:
# (rows, columns) of the needs kept for train+valid and of the needs held out for test.
(df_need_full_train.shape, df_need_test.shape)

((963, 2), (107, 2))

In [9]:
# Train - valid: the non-test rows of each class are split again, with the
# same fixed seed.
valid_split_options = {'train_size': train_valid_ratio, 'random_state': 1}
df_need_train, df_need_valid = train_test_split(df_need_full_train, **valid_split_options)
df_review_train, df_review_valid = train_test_split(df_review_full_train, **valid_split_options)

In [10]:
# (rows, columns) of the needs used for training and of the needs used for validation.
(df_need_train.shape, df_need_valid.shape)

((770, 2), (193, 2))

In [11]:
# For each split, stack the need rows on top of the review rows and
# renumber the index from 0.
df_train, df_valid, df_test = (
    pd.concat([need_part, review_part], ignore_index=True, sort=False)
    for need_part, review_part in (
        (df_need_train, df_review_train),
        (df_need_valid, df_review_valid),
        (df_need_test, df_review_test),
    )
)

In [12]:
# Write each combined split next to the input data, without the row index.
for split_file, split_frame in (
    ('train.csv', df_train),
    ('valid.csv', df_valid),
    ('test.csv', df_test),
):
    split_frame.to_csv(des_path + split_file, index=False)

In [11]:
# Preprocess novel data: load the workbook, using its first column as the row index.
novel_needs = pd.read_excel(io=novel_file_path, index_col=0)

In [12]:
# Give every novel need label 1 and expose its text under the column name
# 'text', as in the train/valid/test frames.
novel_needs = (
    novel_needs
    .assign(label=1)
    .rename(columns={'make-up reviews': 'text'})
)

In [13]:
# Show the labelled novel-needs frame as the cell output.
novel_needs

Unnamed: 0_level_0,text,label
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,The laptop is very small and portable which I ...,1
2,I mainly use my laptop to create things. This ...,1
3,"This is the smallest laptop I can find, but I ...",1
4,"This laptop is not bad, but the keyboard is ha...",1
5,The screen size of this laptop is perhaps the ...,1
6,I have a high demand for the screen. This lapt...,1
7,This gaming laptop is very good and I like it....,1
8,Bought the laptop last month and in general I ...,1
9,Received the laptop last week and so far so go...,1
10,This laptop is cool and I like it very much. H...,1


In [14]:
# Export the novel needs with a header row and without the index column.
novel_needs.to_csv(
    novel_needs_csv_path,
    header=True,
    index=False,
)