In [1]:
from nltk.corpus.reader.wordnet import Lemma
import nltk

# For file reading
import os
from os import listdir
# For pre-processing 
from nltk import word_tokenize
from nltk.tokenize import punkt
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem import PorterStemmer

import re
import pandas as pd
import numpy as np
import string
import random

import torch
import torch.nn as nn
import torch.nn.functional as F
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm import tqdm
from collections import Counter
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans

In [2]:
def read_data(path: str, encoding: "str | None" = None) -> list:
    """Read every regular file in a directory into a list of line lists.

    Args:
        path: Directory to scan. Sub-directories are not descended into.
        encoding: Text encoding handed to ``open``. ``None`` (the default)
            keeps the platform default encoding.

    Returns:
        One list per file, holding that file's lines exactly as returned by
        ``readlines`` (trailing newlines kept). Files are ordered by name.
        ``README.txt`` and entries that are not regular files are skipped.
    """
    documents = []
    # os.listdir yields entries in arbitrary order; sorting makes the document
    # order (and therefore any later slicing of the data) reproducible.
    for name in sorted(listdir(path)):
        if name == "README.txt":
            continue
        file_path = os.path.join(path, name)
        if not os.path.isfile(file_path):
            continue
        with open(file_path, "r", encoding=encoding) as handle:
            documents.append(handle.readlines())
    return documents


In [3]:
# Location of the review corpus. Set the PRODUCT_REVIEWS_DIR environment
# variable to run the notebook on another machine; the original absolute path
# is kept as the fallback so existing behaviour is unchanged.
path = os.environ.get(
    "PRODUCT_REVIEWS_DIR",
    "/Users/rubyli/Desktop/GitHubRepos/UoM/NLP-Distributional-Semantics-Neural-Network-for-Classifying-Product-Reviews/product_reviews",
)

# Reading documents: one list of lines per file
res = read_data(path)
# print(res[1][:5])

In [4]:
# Collect the review sentences that will feed the LSTM.
# (The misspelled name `avaliable_data` is kept because later cells use it.)
# Raw string: '\[' is an invalid escape sequence in a plain string literal.
title_tag = re.compile(r'\[t\]')

avaliable_data = []
for document in res:
    for line in document:
        # Strip the surrounding whitespace/newline, then remove the [t] marker
        sentence = title_tag.sub('', line.strip())
        # Skip empty lines and lines that start with "##" (no tag section)
        # NOTE(review): a line that carried a [t] marker keeps its remaining
        # text and is appended as well - confirm that this is intended.
        if sentence and not sentence.startswith("##"):
            avaliable_data.append(sentence)

In [5]:
# Show the last ten collected sentences, each followed by a blank line
last_ten = avaliable_data[-10:]
for tail_sentence in last_ten:
    print(tail_sentence, "\n")

size[+1][u],pictures[+2]##After getting one for Christmas from Amazon, I'm now a believer:  The Digital Elph is truly the world's smallest digital camera that shoots great pictures. 

pictures[+2]##I've had this camera just over 2 years and it takes pretty good pictures. 

size[+3]##The size is tiny, which I love. 

pictures[-1]##Sometimes the pictures are onthe dark side, but I'm able to fix that using the Photoshop 6.0 "adjust levels" function ... not sure how a person would do that without Photoshop, though. 

ease of use[+3][u]##This camera is so easy to use! 

features[+3]##All the features are incredibly intuitive, even for a novice like me. 

pictures[+3],size[+3]##The pictures are beautiful, and the camera is so small you can carry it anywhere. 

price[+2],features[+2],size[+2],ease of use[+2]##I, as many others, have waited for many years for the convergence of price, features, size and ease of use to hit that happy center point. 

fun[+3]##I bought this little guy a few weeks

In [6]:
# Number of collected sentences, then the last ten of them as a list
print(len(avaliable_data), avaliable_data[-10:], sep="\n")

2138
["size[+1][u],pictures[+2]##After getting one for Christmas from Amazon, I'm now a believer:  The Digital Elph is truly the world's smallest digital camera that shoots great pictures.", "pictures[+2]##I've had this camera just over 2 years and it takes pretty good pictures.", 'size[+3]##The size is tiny, which I love.', 'pictures[-1]##Sometimes the pictures are onthe dark side, but I\'m able to fix that using the Photoshop 6.0 "adjust levels" function ... not sure how a person would do that without Photoshop, though.', 'ease of use[+3][u]##This camera is so easy to use!', 'features[+3]##All the features are incredibly intuitive, even for a novice like me.', 'pictures[+3],size[+3]##The pictures are beautiful, and the camera is so small you can carry it anywhere.', 'price[+2],features[+2],size[+2],ease of use[+2]##I, as many others, have waited for many years for the convergence of price, features, size and ease of use to hit that happy center point.', 'fun[+3]##I bought this little

In [32]:
# Store review text and sentiment tags separately in 2 lists
raw_review = []
raw_sentiment = []

temp_tag = []   # one list of score strings per sentence, e.g. ['+2', '-1']
temp_text = []  # the review text belonging to each entry of temp_tag
for data in avaliable_data:
    # Everything before the first "##" is the tag section, the rest is the
    # review text. Without a "##" the tag section is empty, so digits inside
    # an untagged sentence are never mistaken for sentiment scores.
    left_tag, separator, right_text = data.partition("##")
    if not separator:
        left_tag, right_text = "", data

    # Signed scores such as +2 or -1. '|' is a literal inside a character
    # class, so it is left out of the class.
    num_list = re.findall(r'[+-]\d+', left_tag)
    # Unsigned scores written as [n]; the capture group returns the digits
    # without the surrounding brackets.
    num_list += re.findall(r'\[(\d+)\]', left_tag)

    temp_tag.append(num_list)
    temp_text.append(right_text)

# TODO: turn each tag list into a positive/negative label for raw_sentiment

print("Tags: \n",temp_tag[-20:])
print("Texts: \n",temp_text[-20:])

Tags: 
 [['-1'], ['-1'], ['1'], ['+2', '+2'], ['+3', '+2'], ['+3'], ['+2', '+2'], ['+1'], ['+2'], ['+1'], ['+1', '+2'], ['+2'], ['+3'], ['-1'], ['+3'], ['+3'], ['+3', '+3'], ['+2', '+2', '+2', '+2'], ['+3'], ['+3']]
Texts: 
 ['8- Buttons are little, but If you have steady fingers and dealed with a casio wirst calculator watch, you can deal with almost anything.', 'Buttons size is OK for me, my wife, but not my grany.', 'i researched and researched between getting the canon s20 or the canon s100, and of course i got the latter!', "I was looking for something that wasn't too complicated to use and to help the pics I post on Ebay show more detial.", 'This camera is great, wonderfully small, and has attactive hi-tech casing.', "Very easy to use for someone like me who's just hopping on the digital camera bandwagon.", 'The controls are easy to understand and big enough that they are easy to see and use.', 'The optical and digital zooms combine to give an effective 4x zoom.', 'This is enough

In [None]:
# temp_tag.remove(r'\w+\d+')
# print("Tags: \n",temp_tag[-20:])


#     # Get pre-defined sentiment tags
#     num_list = re.findall(r'[+|-]+\d+', left_tag)
#     num_list = [num.replace('+', '') for num in num_list]
    
#     print("After:\n", num_list)
#     for num in num_list:
#         count = 0
#         count += int(num)
#     # print(count)
    
#     if count >= 0:
#         sentiment.append("positive")
#     else:
#         sentiment.append("negative")
    
# print(num_list) 
# print(sentiment)

In [None]:
# Separate training data and testing data