In [1]:
import pandas as pd
import numpy as np
import nltk
import re
import string
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.corpus import words
from contractions import CONTRACTION_MAP
from stopwords import stop_words

# Load the review data set into a DataFrame, decoding the file explicitly as
# ISO-8859-1. Later cells read the 'review' and 'rating' columns from `dat`.
dat = pd.read_csv('review_ver2.csv', encoding = "ISO-8859-1")

In [2]:
def expand_contractions(word, contraction_map=None):
    """Expand contractions in a piece of text, e.g. "I'll" becomes "I will".

    The text is scanned token by token, where a token is a maximal run of
    non-whitespace characters. A token that is a key of the contraction map
    is replaced by the mapped value; every other token, and all whitespace
    between tokens, is left exactly as it was.

    The lookup is an exact, case-sensitive match on the whole token, so a
    contraction with punctuation attached (such as a trailing comma) is not
    expanded.

    Args:
        word: The text to process. Despite the name it may hold many words.
        contraction_map: Optional mapping from contraction to expansion.
            Defaults to the module-level CONTRACTION_MAP.

    Returns:
        The text with every recognised contraction replaced.
    """
    if contraction_map is None:
        contraction_map = CONTRACTION_MAP

    # Matching on \S+ rather than splitting on a literal " " means that
    # contractions sitting next to a tab or a line break are expanded too.
    return re.sub(
        r'\S+',
        lambda match: contraction_map.get(match.group(0), match.group(0)),
        word,
    )

In [22]:
def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant for a single word.

    The word is tagged on its own with nltk.pos_tag. The upper-cased first
    letter of the resulting tag picks the constant: J -> adjective,
    V -> verb, R -> adverb. N, and any other letter, gives noun.
    """
    first_pair = nltk.pos_tag([word])[0]
    initial = first_pair[1][0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unrecognised initial both resolve to noun.
    return wordnet.NOUN
    
def preprocessing_text(text):
    """Turn one raw review into a list of lemmatised, stop-word-free tokens.

    Steps, in order: expand contractions, delete digits, delete every
    character that is neither a word character nor whitespace, lower-case,
    tokenise with nltk.word_tokenize, lemmatise each token using the part of
    speech reported by get_wordnet_pos, and drop lemmas found in stop_words.

    Args:
        text: The review text. A value that is not a str (for example None
            or a float NaN, which is presumably how a missing CSV cell
            shows up) is treated as an empty review.

    Returns:
        A list of lemma strings. It is empty when the input is not a str or
        when no token survives the filtering.
    """
    # A non-string value would otherwise raise inside expand_contractions and
    # abort processing of the whole column.
    if not isinstance(text, str):
        return []

    expanded_text = expand_contractions(text)
    numbers_removed = re.sub(r'\d+', '', expanded_text)
    # Characters are deleted rather than replaced by a space, so the two
    # sides of a hyphen or comma are fused. \w keeps underscores.
    punct_removed = re.sub(r'[^\w\s]', '', numbers_removed)
    tokens = nltk.word_tokenize(punct_removed.lower())

    lemmatizer = WordNetLemmatizer()
    lemmas = (
        lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens
    )
    # The stop-word test is applied to the lemma, not to the surface token.
    return [lemma for lemma in lemmas if lemma not in stop_words]


In [23]:
def get_bigram(lem_words):
    """Join each pair of adjacent tokens into one space-separated string.

    Args:
        lem_words: Iterable of token strings, in order.

    Returns:
        A new list. With two or more tokens it holds one "wordA wordB"
        string per adjacent pair, e.g. ['a', 'b', 'c'] -> ['a b', 'b c'].
        With zero or one token no pair can be formed, so the tokens
        themselves are returned (as a copy).
    """
    tokens = list(lem_words)

    # Too few tokens for a bigram: return the copy, so the caller never gets
    # back an alias of its own input and the result is always a list.
    if len(tokens) <= 1:
        return tokens

    # zip pairs every token with its successor: (t0, t1), (t1, t2), ...
    return [' '.join(pair) for pair in zip(tokens, tokens[1:])]

In [26]:
# Run preprocessing_text on every entry of the 'review' column; each entry
# becomes the list of lemmas that function returns.
lem_tokens = dat['review'].apply(preprocessing_text)
# Run get_bigram on each of those lemma lists (not on the raw review text),
# giving one list of "wordA wordB" strings per review.
bigram_list = lem_tokens.apply(get_bigram)

hohoho


TypeError: write() argument must be str, not Series

In [41]:
# Wrap the per-review bigram lists in a single-column DataFrame.
df_bigram = bigram_list.to_frame()
# Put the 'rating' column from the loaded data alongside the bigram column.
result = pd.concat([df_bigram,dat['rating']],axis=1)

# Write the combined table to bigram_list.csv.
# NOTE(review): to_csv returns None when it is given a file path, so
# export_csv presumably ends up as None - confirm nothing relies on it.
# NOTE(review): each bigram cell is a Python list, so it is presumably written
# as its string representation - confirm downstream readers parse that.
export_csv = result.to_csv('bigram_list.csv', sep=',')