# Clean the Dataset
## This notebook cleans the text data of a given dataset.

### Note: I recommend using this on a dataset after it has passed the balancing phase.

### Install missing required libraries:
#### Note: When using Google Colab, you will have to do this every time you open the notebook.

In [None]:
!pip install autocorrect
!pip install pyspellchecker

### Import the required libraries:

In [None]:
### Data loading, text cleaning, and spell-checking libraries
import pandas as pd
import csv
import re
from spellchecker import SpellChecker
import nltk
from nltk.corpus import words
import string

In [None]:
# Download the NLTK 'words' corpus; preprocess_reviews looks suggested
# spelling corrections up in it through words.words().
nltk.download('words')

In [None]:
# Module-level spell checker instance.
# NOTE(review): preprocess_reviews builds its own SpellChecker, so within the
# code visible in this notebook this instance is not read anywhere - confirm
# before removing.
spell = SpellChecker()

### (OPTIONAL): If you are using Google Colab, mount your Google Drive so that you have a file system for reading in and storing datasets.

In [None]:
#### Set up the Google Drive connection if needed (Colab only)
from google.colab import drive
# Mount the drive at /content/gdrive; the CSV paths used further down in this
# notebook live under that mount point.
drive.mount('/content/gdrive')

### Indicate the csv file that you want to read in:

In [None]:
# Where the input CSV lives, and which of its columns hold the text to clean
# and the label attached to each observation.
file_in = "/content/gdrive/My Drive/Colab Notebooks/amazon_sentiment_reviews_balanced.csv"
data_col, label_col = "REVIEW", "TAG"

# Read the whole file with pandas' pure-Python parser engine.
df = pd.read_csv(file_in, engine='python')
print("Number of observations:", len(df))
# Last expression of the cell: the notebook renders the first five rows.
df.head(5)

### Define the text preprocessing function.

In [None]:
# Token that stands in for anything recognised as a URL.
_URL_PLACEHOLDER = "<url>"
# Translation table that deletes every ASCII punctuation character.
_STRIP_PUNCTUATION = str.maketrans('', '', string.punctuation)


def _normalize_review(text):
    """Return *text* lowercased, with punctuation, digits and spacing normalized."""
    # '!' and '?' become periods so a period is the only sentence terminator.
    text = re.sub(r'[!?]', '.', text)
    # Everything except periods, ASCII letters, digits and whitespace becomes
    # a space (this already covers quotes and '#').
    text = re.sub(r'[^.a-zA-Z0-9\s]', ' ', text)
    text = re.sub(r'\d', ' ', text)
    # Drop stand-alone lowercase letters. The lookahead keeps the following
    # whitespace available for the next match and also accepts end of text.
    # This runs before lowercasing, so single uppercase letters ("I") survive.
    text = re.sub(r'\s+[a-z](?=\s|$)', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    # NOTE: ':' and '/' were replaced by spaces above, so scheme prefixes such
    # as "http:" cannot be detected here; only these two markers can occur.
    # When one is present, every space-delimited run that ends in a dot plus
    # three lowercase letters is replaced by the placeholder.
    if 'www.' in text or '.com' in text:
        text = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", _URL_PLACEHOLDER, text)
    text = text.lower().strip()
    # Attach lone periods ("good . bad") to the preceding word.
    return re.sub(r' +\.', '.', text)


def _correct_spelling(text, spell, vocabulary, cache):
    """Spell-correct *text* token by token, dropping uncorrectable words."""
    kept = []
    for token in text.split():
        base = token.translate(_STRIP_PUNCTUATION)
        # Pure punctuation and URL placeholders are passed through untouched.
        if not base or _URL_PLACEHOLDER in token:
            kept.append(token)
            continue
        if base not in cache:
            if spell.unknown([base]):
                suggestion = spell.correction(base)
                # None marks a word with no suggestion found in the vocabulary.
                cache[base] = suggestion if suggestion in vocabulary else None
            else:
                cache[base] = base
        replacement = cache[base]
        if replacement == base:
            kept.append(token)
            continue
        # Keep the punctuation attached to the token (e.g. a sentence-ending
        # period) even when the word itself is replaced or dropped.
        lead = token[:len(token) - len(token.lstrip(string.punctuation))]
        trail = token[len(token.rstrip(string.punctuation)):]
        rebuilt = lead + (replacement or '') + trail
        if rebuilt:
            kept.append(rebuilt)
    # Dropping a word can leave its period stranded; re-attach it.
    return re.sub(r' +\.', '.', ' '.join(kept))


def _tidy_periods(text):
    """Collapse runs of periods and remove a period that starts the text."""
    text = re.sub(r'\.{2,}', '.', text)
    if text.startswith('.'):
        text = text[1:].lstrip()
    return text


def preprocess_reviews(reviews, labels, spell_check=True):
    """Clean review texts and return them with their matching labels.

    Entries of ``reviews`` that are not strings, or that are exactly ``""``
    or ``" "``, are skipped together with their label. Every other entry is
    normalized (see ``_normalize_review``), optionally spell-corrected, and
    has its periods tidied. The input sequences are not modified.

    Progress is printed for every processed row whose index is a multiple
    of 1000.

    Args:
        reviews: Sequence of raw review texts.
        labels: Sequence or pandas Series of labels, paired with ``reviews``
            by position.
        spell_check: When true (the default), words that ``SpellChecker``
            reports as unknown are replaced by its suggested correction if
            that suggestion appears in the NLTK ``words`` corpus, and are
            dropped otherwise. Pass ``False`` to skip this slow step.

    Returns:
        A ``(comments, tags)`` tuple of two equally long lists.
    """
    print("Number of observations to parse:", len(reviews))
    # Materialize the labels so they are matched by position, like reviews.
    label_values = list(labels)
    spell = None
    vocabulary = None
    if spell_check:
        spell = SpellChecker()
        # Built once as a set: words.words() is a long list, so testing
        # membership against it for every unknown word is very slow.
        vocabulary = set(words.words())
    corrections = {}  # cleaned word -> itself, its correction, or None
    comments = []
    tags = []
    for i, review in enumerate(reviews):
        if not isinstance(review, str) or review in ("", " "):
            continue
        if i % 1000 == 0:
            print("Update:", i)
        text = _normalize_review(review)
        if spell_check:
            text = _correct_spelling(text, spell, vocabulary, corrections)
        comments.append(_tidy_periods(text))
        tags.append(label_values[i])
    return comments, tags

### Clean the data from a dataset and retrieve lists of the new tag and text data columns.
#### Note: Depending on the length of each text observation, around 3000-5000 comments are processed per hour. Removing the spell-check step from the processing function can reduce the processing time if needed.

In [None]:
# Hand the review column (as a plain list) and the label column to the
# cleaning function; `text` and `labels` end up as the cleaned lists.
text, labels = preprocess_reviews(df[data_col].tolist(), df[label_col])

### This code creates a pandas dataframe out of the cleaned data.

In [None]:
# Assemble the cleaned observations into a DataFrame. The column names come
# from label_col / data_col so the output stays in step with the input
# configuration instead of repeating the literals "TAG" and "REVIEW".
problems_data = pd.DataFrame({
    label_col: labels,
    data_col: text,
})
print(problems_data.head(5))

### Indicate the name of the output csv file for this pandas dataframe:

In [None]:
# Destination path of the cleaned CSV (on the mounted Google Drive).
file_out = "/content/gdrive/My Drive/Colab Notebooks/amazon_sentiment_reviews_cleaned.csv"

### Create the csv file:

In [None]:
# Write the cleaned DataFrame to file_out; index=False leaves the row index
# out of the file.
problems_data.to_csv(file_out, index=False)