In [None]:
# Mount Google Drive at /content/gdrive/ so the corpus files stored on Drive
# can be reached through ordinary file paths. Colab-only: google.colab is not
# importable outside a Colab runtime.
from google.colab import drive
drive.mount('/content/gdrive/')

In [None]:
%cd gdrive/MyDrive/NLP-Project/GCDC_Corpus_v2/GCDC_rerelease/

In [None]:
# List the current directory to confirm the train/test CSV files are present.
%ls 

Clinton_test.csv   Enron_train.csv  Yahoo_train.csv
Clinton_train.csv  README.txt       Yelp_test.csv
Enron_test.csv     Yahoo_test.csv   Yelp_train.csv


In [None]:
import torch
import os
from torch.utils.data import Dataset, DataLoader
import torchtext
from collections import Counter
import numpy as np
import pandas as pd
import pickle

In [None]:
class Tokenizer:
  """Convert the ``text`` column of a CSV file into lists of integer token ids.

  Every distinct token receives an id starting at 1, ordered by descending
  frequency (id 1 is the most frequent token). Id 0 is never assigned.

  Args:
    file: path of the CSV file to read; it must contain a ``text`` column.
    threshold: minimum token frequency. It is stored on the instance but is
      NOT applied at the moment -- every token is kept regardless of how
      rarely it occurs.
  """

  def __init__(self, file, threshold=5):
    self.file = file
    self.data = pd.read_csv(file)
    self.threshold = threshold

  @staticmethod
  def _build_vocab(tokens):
    """Return ``(mapper, inverse_mapper)`` for a list of token lists.

    ``mapper`` maps token -> id and ``inverse_mapper`` maps id -> token.
    Ids are 1-based and follow ``Counter.most_common()`` order.
    """
    counter = Counter(word for line in tokens for word in line)
    # Sort once and derive both directions from the same ranking.
    ranked = [word for word, _ in counter.most_common()]
    mapper = {word: idx for idx, word in enumerate(ranked, start=1)}
    inverse_mapper = {idx: word for word, idx in mapper.items()}
    return mapper, inverse_mapper

  @staticmethod
  def _encode(tokens, mapper):
    """Replace every token by its id; tokens missing from ``mapper`` get a shared fallback id."""
    # Real ids span 1..len(mapper), so the fallback has to be len(mapper) + 1.
    # Using len(mapper) itself would collide with the id of the rarest token.
    other_idx = len(mapper) + 1
    return [[mapper.get(word, other_idx) for word in line] for line in tokens]

  def preprocess(self):
    """Tokenize ``self.data['text']`` and encode it as integer ids.

    Returns:
      A tuple ``(mapped_tokens, inverse_mapper)``: ``mapped_tokens`` holds one
      list of ids per row of the ``text`` column, and ``inverse_mapper`` is a
      dict from id back to the token string.
    """
    # NOTE(review): the 'en' shortcut is believed to be unavailable in newer
    # spaCy releases (which expect a model name such as 'en_core_web_sm');
    # confirm against the versions installed in the target runtime.
    tokenizer = torchtext.data.utils.get_tokenizer('spacy', language='en')
    tokens = [tokenizer(text) for text in self.data['text'].tolist()]

    mapper, inverse_mapper = self._build_vocab(tokens)
    mapped_tokens = self._encode(tokens, mapper)

    return mapped_tokens, inverse_mapper

  def save_tokens(self):
    """Run ``preprocess`` and pickle the results next to the source CSV.

    For a source file ``<name>`` three files are written into its directory:
      * ``<name>.pkl``                 -- the DataFrame read from the CSV
      * ``mapped_tokens_<name>.pkl``   -- the encoded token id lists
      * ``inverse_mapper<name>.pkl``   -- the id -> token dict (this file name
        has no underscore separator; it is kept as is so existing readers of
        the file keep working)

    Returns:
      None.
    """
    mapped_tokens, inverse_mapper = self.preprocess()

    # Prefix only the file name, not the whole path, so that a `file` with a
    # directory component does not produce a path into a non-existent folder.
    folder, name = os.path.split(self.file)

    self.data.to_pickle(self.file + ".pkl")
    with open(os.path.join(folder, 'mapped_tokens_' + name + '.pkl'), 'wb') as f:
      pickle.dump(mapped_tokens, f)
    with open(os.path.join(folder, 'inverse_mapper' + name + '.pkl'), 'wb') as f:
      pickle.dump(inverse_mapper, f)

    return

    

In [None]:
# One tokenizer per Yelp split: t1 reads the training CSV, t2 the test CSV.
t1, t2 = (Tokenizer(csv_name) for csv_name in ('Yelp_train.csv', 'Yelp_test.csv'))

In [None]:
# Tokenize each split and pickle its DataFrame, token ids and id -> token map
# (training split first, then test).
for _split_tokenizer in (t1, t2):
  _split_tokenizer.save_tokens()

In [None]:
# save_tokens() returns None, so `mapped_tokens` is never bound at notebook
# scope. Read back the token ids that t1.save_tokens() pickled for the
# training split, and preview only the first document, not the whole list.
with open('mapped_tokens_Yelp_train.csv.pkl', 'rb') as f:
  mapped_tokens = pickle.load(f)

mapped_tokens[:1]

[[1098,
  481,
  28,
  482,
  34,
  2,
  6264,
  104,
  68,
  150,
  6,
  409,
  25,
  50,
  2422,
  1,
  402,
  6265,
  11,
  26,
  2422,
  2117,
  1,
  6266,
  59,
  12,
  6,
  542,
  34,
  2,
  2787,
  26,
  708,
  4,
  2,
  150,
  10,
  2,
  2422,
  2118,
  44,
  6,
  1099,
  34,
  2,
  4368,
  4,
  778,
  7,
  2,
  482,
  1,
  17,
  1099,
  13,
  83,
  40,
  12,
  16,
  150,
  1,
  21,
  403,
  2,
  249,
  150,
  20,
  728,
  59,
  12,
  2,
  2787,
  26,
  542,
  3,
  599,
  248,
  916,
  4,
  6267,
  53,
  840,
  1380,
  4,
  20,
  52,
  179,
  1099,
  12,
  6,
  294,
  482,
  7,
  394,
  1893,
  20,
  133,
  1,
  21,
  17,
  482,
  13,
  119,
  40,
  12,
  76,
  3399,
  3,
  141,
  3400,
  14,
  2,
  2422,
  11,
  26,
  83,
  76,
  192,
  162,
  439,
  1,
  1719,
  23,
  1584,
  16,
  299,
  3,
  41,
  20,
  229,
  23,
  95,
  52,
  286,
  10,
  10,
  76,
  192,
  34,
  6,
  491,
  45,
  39,
  1,
  9,
  135,
  26,
  841,
  45,
  43,
  527,
  549,
  3,
  6268,
  3,
  6269,
  3,
 

In [None]:
# save_tokens() returns None, so `inverse_mapper` is never bound at notebook
# scope. Read back the id -> token map that t1.save_tokens() pickled for the
# training split (the file name really has no underscore before the CSV name).
with open('inverse_mapperYelp_train.csv.pkl', 'rb') as f:
  inverse_mapper = pickle.load(f)

# Preview the 50 most frequent tokens; the full vocabulary is too long to display.
dict(list(inverse_mapper.items())[:50])

{1: '.',
 2: 'the',
 3: ',',
 4: 'and',
 5: 'I',
 6: 'a',
 7: 'to',
 8: 'was',
 9: ' ',
 10: 'of',
 11: 'it',
 12: 'for',
 13: 'is',
 14: 'in',
 15: '!',
 16: 'that',
 17: 'The',
 18: 'with',
 19: 'my',
 20: 'you',
 21: '\n\n',
 22: 'but',
 23: "n't",
 24: 'on',
 25: 'they',
 26: "'s",
 27: 'have',
 28: 'this',
 29: 'not',
 30: 'had',
 31: 'we',
 32: 'were',
 33: 'are',
 34: 'at',
 35: 'so',
 36: '-',
 37: 'be',
 38: 'me',
 39: 'place',
 40: 'good',
 41: 'as',
 42: ')',
 43: '(',
 44: 'out',
 45: 'food',
 46: 'there',
 47: 'very',
 48: '"',
 49: 'time',
 50: 'do',
 51: 'did',
 52: 'get',
 53: 'just',
 54: 'all',
 55: 'here',
 56: 'like',
 57: 'We',
 58: 'It',
 59: 'up',
 60: 'from',
 61: 'would',
 62: 'back',
 63: 'if',
 64: 'or',
 65: 'an',
 66: 'their',
 67: 'our',
 68: 'one',
 69: 'go',
 70: 'when',
 71: 'about',
 72: 'great',
 73: 'They',
 74: 'can',
 75: 'which',
 76: '$',
 77: 'service',
 78: 'been',
 79: 'your',
 80: 'really',
 81: 'will',
 82: '...',
 83: 'only',
 84: 'what',
 

In [None]:
# `t` is not defined anywhere in the visible notebook; the tokenizers are t1
# (train) and t2 (test). Presumably the training split was meant -- confirm.
t1.data

Unnamed: 0,text_id,subject,text,ratingA1,ratingA2,ratingA3,labelA,ratingM1,ratingM2,ratingM3,ratingM4,ratingM5,labelM
0,FUYQ99EUHg2TOHMMTy7cFQ,Seasons Buffet,Most months this buffet at the Silverton has o...,3,2,3,3,3,2,3,2,1,2
1,8Mkhxopb8jzL748V-oCVKg,The Cleaning Authority - Phoenix,"Awful, I should have looked at yelp. I receiv...",2,3,2,3,2,3,2,3,2,3
2,db7i0JVq9s_AKVvOoHBNxQ,Bad Owl Coffee,"I don't know how I found this place on Yelp, b...",2,3,3,3,3,3,2,2,3,3
3,9HebYLRstsuqti5_7P0hEg,Moe's Southwest Grill,"I love Moe's. So much better than Chipotle, IM...",2,3,3,3,2,3,3,2,3,3
4,7KgfshQrZ9O9iMIABtm59Q,Nirvana the Flavors of India,"Visiting from the US, I was looking for a some...",3,2,2,3,2,3,3,1,2,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,9ISZhCjisjmIpINkfVDkHg,Target,I always go to this target since I live down t...,3,1,1,1,2,2,1,2,3,2
996,-grSvaH5qWD3yn4NiLT3tg,Malones's Bakeries,Malones is a bakers located on Slateford Road ...,1,2,2,1,2,2,2,1,2,1
997,RF7zy-XMzDsUaCa5zq33Vg,Ernie's Restaurant & Bar,Seven of us went to Ernie's for a Christmas ka...,2,3,2,3,2,1,2,3,2,2
998,KiLwmdDe_PDPdov8coneBA,Urban Taco,Omg the food here is sooooooo delish! The mea...,2,1,2,1,2,3,2,2,3,3
