In [17]:
import pandas as pd
import spacy
import json
import re

In [18]:
# Lowercase and tokenize the text with plain whitespace splitting (no spaCy tokenization).
def tokenize_text(review_text, max_tokens=508):
    """Normalize a raw text string and split it into whitespace-delimited tokens.

    Processing steps, in order:
      1. lowercase the text;
      2. drop every non-ASCII character;
      3. remove '@name' mentions (an '@' and the non-whitespace run after it);
      4. replace the HTML entity '&amp;' with '&';
      5. collapse whitespace runs to single spaces and strip both ends;
      6. split on whitespace and keep at most ``max_tokens`` tokens.

    Args:
        review_text: The text to tokenize. ``None`` is treated as empty text.
        max_tokens: Maximum number of tokens to keep. Defaults to 508, the
            limit that was previously hard-coded. ``None`` disables truncation.

    Returns:
        A list of lowercase ASCII string tokens (possibly empty).
    """
    if review_text is None:
        return []

    text = review_text.lower()

    # Drop characters that cannot be represented in ASCII.
    text = text.encode('ascii', 'ignore').decode('ascii')

    # Remove '@name'. The mention may be followed by whitespace OR by the end
    # of the text; requiring whitespace alone would leave a trailing mention
    # such as "thanks @united" untouched.
    text = re.sub(r'@\S*(?:\s|$)', ' ', text)

    # Replace '&amp;' with '&'
    text = re.sub(r'&amp;', '&', text)

    # Collapse runs of whitespace into one space and trim both ends.
    text = re.sub(r'\s+', ' ', text).strip()

    tokens = text.split()

    # Slicing is already a no-op for short lists, so no length check is needed.
    return tokens[:max_tokens]

In [19]:
# Smoke-check the tokenizer on a sample message containing an '@name'
# mention and an '&amp;' HTML entity.
print(
    tokenize_text(
        "@united I'm having issues. Yesterday I rebooked for 24 hours after I was supposed to fly, now I can't log on &amp; check in. Can you help?"
    )
)

["i'm", 'having', 'issues.', 'yesterday', 'i', 'rebooked', 'for', '24', 'hours', 'after', 'i', 'was', 'supposed', 'to', 'fly,', 'now', 'i', "can't", 'log', 'on', '&', 'check', 'in.', 'can', 'you', 'help?']


In [20]:
# Directory holding the source splits (train.json, val.json, test.json).
# File names are appended with '+', so the trailing slash is required.
root_path = './dataset/Full Dataset/'
# Directory that receives the reformatted train/valid/test files.
# Also joined with '+', so the trailing slash is required here too.
experiment_path = './dataset/Experiment/'

In [21]:
# Load the three dataset splits from the source directory.
train_dataset = pd.read_json(root_path + 'train.json', encoding='utf-8')
valid_dataset = pd.read_json(root_path + 'val.json', encoding='utf-8')
test_dataset = pd.read_json(root_path + 'test.json', encoding='utf-8')

# Report the number of rows in each split.
print('Total Training Samples:', train_dataset.shape[0])
print('Total Valid Samples:', valid_dataset.shape[0])
print('Total Test Samples:', test_dataset.shape[0])

Total Training Samples: 145381
Total Valid Samples: 8080
Total Test Samples: 8080


In [22]:
# Display the training row at position 12 to inspect the available columns.
train_dataset.iloc[12]

user_review_posted                                                         51
user_total_helpful_votes                                                   43
expertise                                                            0.003983
user_cities_visited                                                        42
review_days                                                          0.243702
helpful_class                                                               0
review_text                 We stayed here for three nights and would defi...
Name: 12, dtype: object

In [23]:
def data_fomat(dataset):
  """Convert every row of ``dataset`` into a plain dict ready for export.

  Args:
    dataset: Object exposing ``iterrows()`` (e.g. a pandas DataFrame) whose
      rows provide the "expertise", "review_days", "helpful_class" and
      "review_text" fields.

  Returns:
    A list with one dict per row, in row order. Each dict copies
    "expertise", "review_days" and "helpful_class" unchanged and stores
    the output of ``tokenize_text`` under "review_text".
  """
  return [
    {
      "expertise": record["expertise"],
      "review_days": record["review_days"],
      "helpful_class": record["helpful_class"],
      "review_text": tokenize_text(record["review_text"]),
    }
    for _, record in dataset.iterrows()
  ]

### Train Data Format

In [None]:
# Reformat the training split, then write it as one JSON object per line.
train_save_list = data_fomat(train_dataset)

with open(experiment_path + 'train.json', 'w', encoding='utf-8') as file:
  for sample in train_save_list:
    file.write(json.dumps(sample) + "\n")

### Valid Data Format

In [None]:
# Reformat the validation split, then write it as one JSON object per line.
valid_save_list = data_fomat(valid_dataset)

with open(experiment_path + 'valid.json', 'w', encoding='utf-8') as file:
  for sample in valid_save_list:
    file.write(json.dumps(sample) + "\n")

### Test Data Format

In [None]:
# Reformat the test split, then write it as one JSON object per line.
test_save_list = data_fomat(test_dataset)

with open(experiment_path + 'test.json', 'w', encoding='utf-8') as file:
  for sample in test_save_list:
    file.write(json.dumps(sample) + "\n")