In [1]:
import os
import re
from collections import defaultdict
import pandas as pd
import torch
import random
import numpy as np
import json
from sklearn.model_selection import train_test_split

import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Julia\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
def gather_aspects(reviews, aspects_spans):
    """Build a token-level BIO dataset from reviews and their aspect spans.

    Every review is tokenized piecewise with ``word_tokenize``. Tokens inside
    an annotated span are labelled ``B-<tag>`` (first token) / ``I-<tag>``
    (following tokens) and carry the span's sentiment; all other tokens are
    labelled ``"O"`` with sentiment ``"no"``.

    Args:
        reviews: mapping of review id -> review text.
        aspects_spans: mapping of review id -> list of entries indexed as
            ``[tag, text, start, end, sentiment]``; ``start`` and ``end`` are
            character offsets into the review and must be int-convertible.
            Review ids without an entry are treated as having no spans.

    Returns:
        pandas.DataFrame with one row per token and the columns
        ``idx``, ``tokens``, ``class`` and ``sentiment``.
    """
    dataset = {"idx": [], "tokens": [], "class": [], "sentiment": []}

    def add_tokens(idx, tokens, classes, sentiments):
        # All four columns grow together so their lengths can never diverge.
        dataset["idx"].extend([idx] * len(tokens))
        dataset["tokens"].extend(tokens)
        dataset["class"].extend(classes)
        dataset["sentiment"].extend(sentiments)

    def add_outside(idx, text):
        tokens = word_tokenize(text)
        add_tokens(idx, tokens, ["O"] * len(tokens), ["no"] * len(tokens))

    for idx, review in reviews.items():
        # .get() works for plain dicts and does not insert keys into a
        # defaultdict; sorting by start offset makes the left-to-right walk
        # correct even when the annotation file is not ordered.
        spans = sorted(aspects_spans.get(idx, ()), key=lambda span: int(span[2]))
        start = 0

        for span in spans:
            s = int(span[2])
            e = int(span[3])
            # Skip spans that are out of bounds, empty/inverted, or that
            # overlap text already consumed by a previous span.
            if not (start <= s < e <= len(review)):
                continue

            span_tokens = word_tokenize(review[s:e])
            if not span_tokens:
                # Whitespace-only span: emitting a "B-" label without a token
                # would leave the columns with different lengths.
                continue

            add_outside(idx, review[start:s])

            tag = span[0]
            classes = ["B-" + tag] + ["I-" + tag] * (len(span_tokens) - 1)
            add_tokens(idx, span_tokens, classes, [span[4]] * len(span_tokens))
            start = e

        # Always emit the remaining text, including the whole review when it
        # has no usable span at all (start is still 0 in that case).
        add_outside(idx, review[start:])

    return pd.DataFrame(dataset)

In [3]:
def gather_cats(reviews, cats):
    """Build the review-level category table.

    Args:
        reviews: mapping of review id -> review text.
        cats: mapping of review id -> sequence of marks; positions 0..4 are
            read as the Food, Interior, Price, Whole and Service marks.

    Returns:
        pandas.DataFrame with the columns ``idx``, ``text``, ``Food``,
        ``Interior``, ``Price``, ``Whole`` and ``Service`` (one row per review).
    """
    # Column order doubles as the positional order of the marks.
    category_names = ("Food", "Interior", "Price", "Whole", "Service")
    columns = {name: [] for name in ("idx", "text") + category_names}

    for review_id, review_text in reviews.items():
        columns["idx"].append(review_id)
        columns["text"].append(review_text)
        marks = cats[review_id]
        for position, name in enumerate(category_names):
            columns[name].append(marks[position])

    return pd.DataFrame(columns)

In [4]:
def _iter_tsv_fields(filename, maxsplit=-1):
    """Yield the tab-separated fields of every non-blank line in ``filename``."""
    with open(filename, encoding="utf-8") as file:
        for line in file:
            # Strip only the line terminator: slicing off the last character
            # would eat real data when the final line has no trailing newline.
            line = line.rstrip("\r\n")
            if not line.strip():
                continue
            yield line.split("\t", maxsplit)


def make_datasets(aspect_filename, cats_filename, reviews_filename):
    """Read the three tab-separated annotation files and build both datasets.

    Args:
        aspect_filename: file with six fields per line:
            id, class, token, start, end, mark.
        cats_filename: file with three fields per line: id, category, mark.
        reviews_filename: file with two fields per line: id, text.

    Returns:
        ``(df_cats, df_aspects)`` - the results of ``gather_cats`` and
        ``gather_aspects`` for the loaded data.

    Raises:
        ValueError: if a non-blank line has an unexpected number of fields.
    """
    aspects_spans = defaultdict(list)
    for idx, cls, token, start, end, mark in _iter_tsv_fields(aspect_filename):
        aspects_spans[idx].append([cls, token, start, end, mark])

    reviews = dict()
    # maxsplit=1 keeps the review text intact even if it contains tabs.
    for idx, text in _iter_tsv_fields(reviews_filename, maxsplit=1):
        reviews[idx] = text

    df_aspects = gather_aspects(reviews, aspects_spans)

    cats = defaultdict(list)
    # NOTE(review): only the mark is stored, and gather_cats reads the marks
    # by position, so this relies on the category lines of each review being
    # listed in a fixed order in the file - confirm against the data format.
    for idx, _cat, mark in _iter_tsv_fields(cats_filename):
        cats[idx].append(mark)

    df_cats = gather_cats(reviews, cats)

    return df_cats, df_aspects

In [5]:
# Load the raw training annotations into the category-level and token-level frames.
df_cats, df_aspects = make_datasets(
    aspect_filename="data/train_aspects.txt",
    cats_filename="data/train_cats.txt",
    reviews_filename="data/train_reviews.txt",
)

In [6]:
# Review ids, one per row of the category table; the split below is done on these.
indices = df_cats["idx"].tolist()

In [7]:
# First cut: 80% of the ids for training, 20% held out.
train, test = train_test_split(indices, random_state=123, test_size=0.2)
# Second cut: the held-out ids are halved into validation and test.
val, test = train_test_split(test, random_state=123, test_size=0.5)

In [8]:
# Sizes of the train / validation / test id lists.
tuple(map(len, (train, val, test)))

(227, 28, 29)

In [9]:
# Category-level rows of each split, selected by review id.
df_cats_train = df_cats.loc[df_cats["idx"].isin(train)]
df_cats_val = df_cats.loc[df_cats["idx"].isin(val)]
df_cats_test = df_cats.loc[df_cats["idx"].isin(test)]

# Token-level rows of each split, selected by the same review ids.
df_aspects_train = df_aspects.loc[df_aspects["idx"].isin(train)]
df_aspects_val = df_aspects.loc[df_aspects["idx"].isin(val)]
df_aspects_test = df_aspects.loc[df_aspects["idx"].isin(test)]

In [10]:
# Persist every split; the frames are written in the same order as before.
for frame, path in [
    (df_cats_train, "data/cats_train.csv"),
    (df_cats_val, "data/cats_val.csv"),
    (df_cats_test, "data/cats_test.csv"),
    (df_aspects_train, "data/aspects_train.csv"),
    (df_aspects_val, "data/aspects_val.csv"),
    (df_aspects_test, "data/aspects_test.csv"),
]:
    frame.to_csv(path)

In [17]:
def rewrite_files(old_file, new_file, indices=None):
    """Copy the lines of ``old_file`` that belong to the selected ids.

    A line is kept when its first tab-separated field is one of ``indices``;
    kept lines are written to ``new_file`` unchanged and in their original
    order. ``new_file`` is overwritten.

    Args:
        old_file: path of the tab-separated source file (UTF-8).
        new_file: path of the file to write (UTF-8).
        indices: iterable of ids to keep. When omitted, the notebook-level
            ``test`` list is used, which preserves the original behaviour.
    """
    if indices is None:
        # Backward-compatible fallback to the global split defined earlier.
        indices = test
    # Set membership keeps the per-line check constant time.
    wanted = set(indices)

    with open(old_file, encoding="utf-8") as file_read:
        with open(new_file, "w", encoding="utf-8") as file_write:
            for line in file_read:
                idx = line.split("\t", 1)[0]
                if idx in wanted:
                    file_write.write(line)

In [19]:
# Write the lines of the selected reviews from each raw file into its dev_* counterpart.
for source_path, target_path in (
    ("data/train_aspects.txt", "data/dev_aspects.txt"),
    ("data/train_cats.txt", "data/dev_cats.txt"),
    ("data/train_reviews.txt", "data/dev_reviews.txt"),
):
    rewrite_files(source_path, target_path)