# Sentiment analysis project

In [1]:
# Mount Google Drive (Colab only) so the project files under
# /content/gdrive/ can be read and written by the cells below.
from google.colab import drive

drive.mount('/content/gdrive/')

Drive already mounted at /content/gdrive/; to attempt to forcibly remount, call drive.mount("/content/gdrive/", force_remount=True).


In [0]:
# Project folder on the mounted Drive. The trailing slash matters: later
# cells build file paths by plain string concatenation (root_path + '...').
root_path = '/content/gdrive/My Drive/sentiment_analysis/'

## Preprocess and prepare a vocabulary

In [0]:
# Imports 
import pandas as pd
from matplotlib import pyplot as plt
from collections import Counter
import numpy as np
import plotly.offline as pyo
from string import punctuation


In [4]:
# Load the training set from the project folder and report how many
# reviews (rows of the `text` column) it contains.
train_csv = pd.read_csv(root_path + 'dataset/train.csv')
print(len(train_csv['text']))

5279


In [5]:
# Inspect the characters that will be stripped from the reviews.
# NOTE(review): string.punctuation is ASCII-only, so typographic characters
# such as ’ and – are not in it — confirm whether they should be removed too.
punctuation


'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [0]:
# Concatenate every review into one space-separated string so the whole
# corpus can be cleaned in a single pass.
reviews = ' '.join(train_csv['text'])


In [7]:
# Sanity check: the joined corpus should be a single str.
type(reviews)

str

In [8]:
# Normalise case so that "Good" and "good" map to the same vocabulary entry.
reviews = reviews.lower()
print(reviews[:100])

# Strip every character listed in string.punctuation in one pass.
# NOTE(review): that set is ASCII-only, so typographic marks such as ’ and –
# survive (tokens like 'don’t' and '–' remain) — confirm this is intended.
all_text = reviews.translate(str.maketrans('', '', punctuation))

# Replace newlines with spaces so the corpus is one flat line of text.
reviews_split = all_text.split('\n')
all_text = ' '.join(reviews_split)

# Whitespace-split the cleaned corpus into individual word tokens.
words = all_text.split()
words[:100]

autoimmune diseases tend to come in clusters. as for gilenya – if you feel good, don’t think about i


['autoimmune',
 'diseases',
 'tend',
 'to',
 'come',
 'in',
 'clusters',
 'as',
 'for',
 'gilenya',
 '–',
 'if',
 'you',
 'feel',
 'good',
 'don’t',
 'think',
 'about',
 'it',
 'it',
 'won’t',
 'change',
 'anything',
 'but',
 'waste',
 'your',
 'time',
 'and',
 'energy',
 'i’m',
 'taking',
 'tysabri',
 'and',
 'feel',
 'amazing',
 'no',
 'symptoms',
 'other',
 'than',
 'dodgy',
 'color',
 'vision',
 'but',
 'i’ve',
 'had',
 'it',
 'since',
 'always',
 'so',
 'don’t',
 'know',
 'and',
 'i',
 'don’t',
 'know',
 'if',
 'it',
 'will',
 'last',
 'a',
 'month',
 'a',
 'year',
 'a',
 'decade',
 'ive',
 'just',
 'decided',
 'to',
 'enjoy',
 'the',
 'ride',
 'no',
 'point',
 'in',
 'worrying',
 'i',
 'can',
 'completely',
 'understand',
 'why',
 'you’d',
 'want',
 'to',
 'try',
 'it',
 'but',
 'results',
 'reported',
 'in',
 'lectures',
 'don’t',
 'always',
 'stand',
 'up',
 'to',
 'the',
 'scrutiny',
 'of',
 'peerreview']

In [9]:
# NLTK supplies the English stop-word list used to filter the vocabulary.
from nltk.corpus import stopwords

# The stop-word corpus has to be downloaded the first time it is used
# in a given environment.
import nltk
nltk.download('stopwords')


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [0]:
# Materialise the stop-word list once, as a set. The original expression
# called stopwords.words('english') inside the comprehension, re-reading the
# list and doing a linear membership scan for every single corpus word.
english_stopwords = set(stopwords.words('english'))

# Corpus tokens with English stop words removed (order and duplicates kept).
new_words = [word for word in words if word not in english_stopwords]


In [11]:
# Number of tokens remaining after stop-word removal.
len(new_words)

1060060

In [0]:
# Keep only tokens that are at least three characters long.
vocabulary_latest = list(filter(lambda token: len(token) > 2, new_words))

In [13]:
# Number of tokens remaining after also dropping words of length <= 2.
len(vocabulary_latest)

999239

In [14]:
# Preview the first 100 tokens of the length-filtered word list.
vocabulary_latest[0:100]

['autoimmune',
 'diseases',
 'tend',
 'come',
 'clusters',
 'gilenya',
 'feel',
 'good',
 'don’t',
 'think',
 'won’t',
 'change',
 'anything',
 'waste',
 'time',
 'energy',
 'i’m',
 'taking',
 'tysabri',
 'feel',
 'amazing',
 'symptoms',
 'dodgy',
 'color',
 'vision',
 'i’ve',
 'since',
 'always',
 'don’t',
 'know',
 'don’t',
 'know',
 'last',
 'month',
 'year',
 'decade',
 'ive',
 'decided',
 'enjoy',
 'ride',
 'point',
 'worrying',
 'completely',
 'understand',
 'you’d',
 'want',
 'try',
 'results',
 'reported',
 'lectures',
 'don’t',
 'always',
 'stand',
 'scrutiny',
 'peerreview',
 'publication',
 'much',
 'still',
 'convincing',
 'hope',
 'work',
 'really',
 'you’re',
 'aware',
 'happy',
 'risks',
 'that’s',
 'great',
 'think',
 'it’s',
 'important',
 'present',
 'balanced',
 'way',
 'understand',
 'don’t',
 'move',
 'straight',
 'first',
 'show',
 'promise',
 'animal',
 'study',
 'using',
 'drugs',
 'humans',
 'there’s',
 'still',
 'lot',
 'animal',
 'data',
 'gather',
 'human',


# pickling the progress

In [0]:
import pickle

In [0]:
# Persist both word lists so the slow preprocessing above need not be re-run.
# The context manager guarantees both files are flushed and closed even if
# pickle.dump raises part-way through.
with open(root_path + 'pickle/complete_vocabulary.pickle','wb') as complete_vocabulary, \
     open(root_path + 'pickle/vocabulary_g3.pickle','wb') as vocabulary_g3:
    # Stop-word-filtered tokens.
    pickle.dump(new_words, complete_vocabulary)
    # Same tokens with words of length <= 2 removed.
    pickle.dump(vocabulary_latest, vocabulary_g3)

In [0]:
# Reload the pickled stop-word-filtered token list into `words`.
# The context manager closes the handle; the original left it open.
# NOTE: pickle.load can execute arbitrary code — only load files that this
# notebook itself wrote.
with open(root_path + 'pickle/complete_vocabulary.pickle','rb') as file:
    words = pickle.load(file)

## Tokenizing the words

In [0]:
# Counter is used to rank the words by how often they occur.
from collections import Counter

# Count each word, order the distinct words from most to least frequent,
# then number them starting at 1 (tokenize_review uses 0 for unknown words).
counts = Counter(new_words)
vocab = [word for word, _ in counts.most_common()]
vocab_to_int = dict(zip(vocab, range(1, len(vocab) + 1)))
# Inverse lookup: integer id -> word.
int_to_vocab = {index: word for word, index in vocab_to_int.items()}


In [0]:
# Persist both lookup tables. The context manager guarantees the files are
# flushed and closed even if pickle.dump raises part-way through.
with open(root_path  + 'pickle/vocab_to_int.pickle','wb') as vocab_to_int_pickle, \
     open(root_path + 'pickle/int_to_vocab.pickle','wb') as int_to_vocab_pickle:
    # word -> integer id
    pickle.dump(vocab_to_int,vocab_to_int_pickle)
    # integer id -> word
    pickle.dump(int_to_vocab,int_to_vocab_pickle)

In [0]:
import json
# json.dumps(vocab_to_int)

# Functions to tokenize the raw review and label

In [0]:
def tokenize_review(review, vocab=None):
    """Convert a raw review string into a list of integer tokens.

    Characters listed in ``string.punctuation`` are removed, the text is
    split on any run of whitespace (the same rule used when the vocabulary
    was built), and each lower-cased word is looked up in the vocabulary.

    Args:
        review: Raw review text.
        vocab: Optional word -> integer mapping. Defaults to the notebook's
            global ``vocab_to_int``.

    Returns:
        A list with one integer per word. Words missing from the vocabulary
        (including stop words, which were filtered out before the vocabulary
        was built) map to 0. An empty or whitespace-only review gives ``[]``.
    """
    if vocab is None:
        vocab = vocab_to_int

    cleaned = ''.join(c for c in review if c not in punctuation)
    # split() rather than split(' '): repeated spaces no longer yield empty
    # strings (spurious 0 tokens), and newlines/tabs separate words instead
    # of gluing them together.
    return [vocab.get(word.lower(), 0) for word in cleaned.split()]

In [37]:
# Smoke-test the tokenizer on the first training review; a 0 marks a word
# that is not present in vocab_to_int.
tokenize_review(list(train_csv.text)[0])

[965,
 418,
 1934,
 0,
 307,
 0,
 9972,
 0,
 0,
 92,
 68,
 0,
 0,
 128,
 41,
 207,
 79,
 0,
 0,
 0,
 1577,
 229,
 375,
 0,
 4864,
 0,
 15,
 0,
 1168,
 73,
 69,
 198,
 0,
 128,
 1578,
 0,
 53,
 0,
 0,
 11641,
 2140,
 243,
 0,
 247,
 0,
 0,
 59,
 293,
 0,
 207,
 39,
 0,
 0,
 207,
 39,
 0,
 0,
 0,
 78,
 0,
 272,
 0,
 47,
 0,
 3523,
 277,
 0,
 946,
 0,
 2442,
 0,
 3251,
 0,
 419,
 0,
 4255]

In [0]:
def tokenize_setiment(label):
  """One-hot encode a sentiment label over three classes.

  (The misspelled name is kept so existing callers continue to work.)

  Args:
    label: Class id 0, 1 or 2, given either as a number or as the
      corresponding one-character string ('0', '1', '2').

  Returns:
    A 3-element list with a 1 at the label's index and 0 elsewhere:
    0 -> [1, 0, 0], 1 -> [0, 1, 0], 2 -> [0, 0, 1].
    Returns None for any other label.
  """
  for class_index in range(3):
    if label == str(class_index) or label == class_index:
      one_hot = [0, 0, 0]
      # The original returned [0, 0, 0] for class 0, which is not a valid
      # one-hot vector; class 0 now sets index 0 like the other classes.
      one_hot[class_index] = 1
      return one_hot
  return None

In [53]:
# Spot-check the label encoding on one training row.
tokenize_setiment(list(train_csv.sentiment)[2000])

[0, 0, 1]