In [302]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.tokenize import WordPunctTokenizer
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re
from replacers import RegexReplacer

# init function

In [303]:
# remove HTML tag pattern
rc = re.compile(r"\<.*?\>")  
# Replacer class
replacer = RegexReplacer()
# split sentence into word
pattern = r'[\d.,]+|[A-Z][.A-Z]+\b\.*|\w+|\S'
tokenizer = RegexpTokenizer(pattern)
# Lemmatizer
wordnet_lemmatizer = WordNetLemmatizer()

# Row data path

In [304]:
LabelTrainDataPath = "RowData\\labeledTrainData.tsv"
unLabelTrainDataPath = "RowData\\unlabeledTrainData.tsv"
testDataPath = "RowData\\testData.tsv"
LabelTrainDataFrame = pd.read_csv(LabelTrainDataPath, sep='\t',quoting=3).head(10)

In [305]:
LabelTrainDataFrame

Unnamed: 0,id,sentiment,review
0,"""5814_8""",1,"""With all this stuff going down at the moment ..."
1,"""2381_9""",1,"""\""The Classic War of the Worlds\"" by Timothy ..."
2,"""7759_3""",0,"""The film starts with a manager (Nicholas Bell..."
3,"""3630_4""",0,"""It must be assumed that those who praised thi..."
4,"""9495_8""",1,"""Superbly trashy and wondrously unpretentious ..."
5,"""8196_8""",1,"""I dont know why people think this is such a b..."
6,"""7166_2""",0,"""This movie could have been very good, but com..."
7,"""10633_1""",0,"""I watched this video at a friend's house. I'm..."
8,"""319_1""",0,"""A friend of mine bought this film for £1, and..."
9,"""8713_10""",1,"""<br /><br />This movie is full of references...."


# define apply function

In [306]:
def SplitPhase(row):
    """ split paragraph to sentence """
    return PunktTokenizer.tokenize(row['review'])

In [307]:
def RemoveHTML(row):
    """ remove HTML tags """
    return [rc.sub('',sentence) for sentence in row['review']]

In [308]:
def ReplaceAbbre(row):
    """ Replace abbreviation """
    return [replacer.replace(sentence) for sentence in row['review']]

In [309]:
def SplitSent(row):
    """ split sentence to words """
    return [tokenizer.tokenize(sentence) for sentence in row['review']]

In [310]:
def lemma(tags):
    WORD = []
    for word, tag in tags:
         wntag = tag[0].lower()
         wntag = wntag if wntag in ['a', 'r', 'n', 'v','n','s'] else None
         if not wntag:
            lemma = word
         else:
            lemma = wordnet_lemmatizer.lemmatize(word, wntag)
         WORD.append(lemma)
    return WORD

In [311]:
def Lemmatizer(row):
    """ Lemmatizer words use WordNet """
    return [ lemma(nltk.pos_tag(sentence)) for sentence in row['review']]

In [312]:
def CleanWords(sentence):
    return [word.lower() for word in sentence if len(word)>=2 and (word.isalpha() or word.isdigit())]

In [313]:
def CleanSentences(row):
    return [CleanWords(sentence) for sentence in row['review']]

In [314]:
def ToStr(row):
    str=""
    for sentence in row['review']:
        for word in sentence:
            str += (word + " ")
    return str[:-1]

# apply to dataframe

## split into sentence

In [315]:
PunktTokenizer = nltk.tokenize.punkt.PunktSentenceTokenizer()
LabelTrainDataFrame['review'] = LabelTrainDataFrame.apply(SplitPhase, axis=1)

In [316]:
for i in LabelTrainDataFrame['review']:
    print(i)
    break

['"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again.', 'Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent.', 'Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released.', "Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.<br /><br />Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring.", 'Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.<br /><br />The actual feature film bit when

## Remove HTMLtags

In [317]:
LabelTrainDataFrame['review'] = LabelTrainDataFrame.apply(RemoveHTML, axis=1)

In [318]:
for i in LabelTrainDataFrame['review']:
    print(i)
    break

['"With all this stuff going down at the moment with MJ i\'ve started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again.', 'Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent.', 'Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released.', "Some of it has subtle messages about MJ's feeling towards the press and also the obvious message of drugs are bad m'kay.Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring.", 'Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.The actual feature film bit when it finally starts is on

## replace abbreviation

In [319]:
LabelTrainDataFrame['review'] = LabelTrainDataFrame.apply(ReplaceAbbre, axis=1)

In [320]:
for i in LabelTrainDataFrame['review']:
    print(i)
    break

['"With all this stuff going down at the moment with MJ i have started listening to his music, watching the odd documentary here and there, watched The Wiz and watched Moonwalker again.', 'Maybe i just want to get a certain insight into this guy who i thought was really cool in the eighties just to maybe make up my mind whether he is guilty or innocent.', 'Moonwalker is part biography, part feature film which i remember going to see at the cinema when it was originally released.', "Some of it has subtle messages about MJ is feeling towards the press and also the obvious message of drugs are bad m'kay.Visually impressive but of course this is all about Michael Jackson so unless you remotely like MJ in anyway then you are going to hate this and find it boring.", 'Some may call MJ an egotist for consenting to the making of this movie BUT MJ and most of his fans would say that he made it for the fans which if true is really nice of him.The actual feature film bit when it finally starts is 

## split into words

In [321]:
LabelTrainDataFrame['review'] = LabelTrainDataFrame.apply(SplitSent, axis=1)

In [322]:
for i in LabelTrainDataFrame['review']:
    print(i)
    break

[['"', 'With', 'all', 'this', 'stuff', 'going', 'down', 'at', 'the', 'moment', 'with', 'MJ', 'i', 'have', 'started', 'listening', 'to', 'his', 'music', ',', 'watching', 'the', 'odd', 'documentary', 'here', 'and', 'there', ',', 'watched', 'The', 'Wiz', 'and', 'watched', 'Moonwalker', 'again', '.'], ['Maybe', 'i', 'just', 'want', 'to', 'get', 'a', 'certain', 'insight', 'into', 'this', 'guy', 'who', 'i', 'thought', 'was', 'really', 'cool', 'in', 'the', 'eighties', 'just', 'to', 'maybe', 'make', 'up', 'my', 'mind', 'whether', 'he', 'is', 'guilty', 'or', 'innocent', '.'], ['Moonwalker', 'is', 'part', 'biography', ',', 'part', 'feature', 'film', 'which', 'i', 'remember', 'going', 'to', 'see', 'at', 'the', 'cinema', 'when', 'it', 'was', 'originally', 'released', '.'], ['Some', 'of', 'it', 'has', 'subtle', 'messages', 'about', 'MJ', 'is', 'feeling', 'towards', 'the', 'press', 'and', 'also', 'the', 'obvious', 'message', 'of', 'drugs', 'are', 'bad', 'm', "'", 'kay', '.', 'Visually', 'impressive'

## Lemmatizer

In [323]:
LabelTrainDataFrame['review'] = LabelTrainDataFrame.apply(Lemmatizer, axis=1)
for i in LabelTrainDataFrame['review']:
    print(i)
    break

[['"', 'With', 'all', 'this', 'stuff', 'go', 'down', 'at', 'the', 'moment', 'with', 'MJ', 'i', 'have', 'start', 'listen', 'to', 'his', 'music', ',', 'watch', 'the', 'odd', 'documentary', 'here', 'and', 'there', ',', 'watch', 'The', 'Wiz', 'and', 'watch', 'Moonwalker', 'again', '.'], ['Maybe', 'i', 'just', 'want', 'to', 'get', 'a', 'certain', 'insight', 'into', 'this', 'guy', 'who', 'i', 'think', 'be', 'really', 'cool', 'in', 'the', 'eighty', 'just', 'to', 'maybe', 'make', 'up', 'my', 'mind', 'whether', 'he', 'be', 'guilty', 'or', 'innocent', '.'], ['Moonwalker', 'be', 'part', 'biography', ',', 'part', 'feature', 'film', 'which', 'i', 'remember', 'go', 'to', 'see', 'at', 'the', 'cinema', 'when', 'it', 'be', 'originally', 'release', '.'], ['Some', 'of', 'it', 'have', 'subtle', 'message', 'about', 'MJ', 'be', 'feel', 'towards', 'the', 'press', 'and', 'also', 'the', 'obvious', 'message', 'of', 'drug', 'be', 'bad', 'm', "'", 'kay', '.', 'Visually', 'impressive', 'but', 'of', 'course', 'this

## Clean words, if word2vector, not  remove stop words

In [324]:
LabelTrainDataFrame['review'] = LabelTrainDataFrame.apply(CleanSentences, axis=1)
for i in LabelTrainDataFrame['review']:
    print(i)
    break

[['with', 'all', 'this', 'stuff', 'go', 'down', 'at', 'the', 'moment', 'with', 'mj', 'have', 'start', 'listen', 'to', 'his', 'music', 'watch', 'the', 'odd', 'documentary', 'here', 'and', 'there', 'watch', 'the', 'wiz', 'and', 'watch', 'moonwalker', 'again'], ['maybe', 'just', 'want', 'to', 'get', 'certain', 'insight', 'into', 'this', 'guy', 'who', 'think', 'be', 'really', 'cool', 'in', 'the', 'eighty', 'just', 'to', 'maybe', 'make', 'up', 'my', 'mind', 'whether', 'he', 'be', 'guilty', 'or', 'innocent'], ['moonwalker', 'be', 'part', 'biography', 'part', 'feature', 'film', 'which', 'remember', 'go', 'to', 'see', 'at', 'the', 'cinema', 'when', 'it', 'be', 'originally', 'release'], ['some', 'of', 'it', 'have', 'subtle', 'message', 'about', 'mj', 'be', 'feel', 'towards', 'the', 'press', 'and', 'also', 'the', 'obvious', 'message', 'of', 'drug', 'be', 'bad', 'kay', 'visually', 'impressive', 'but', 'of', 'course', 'this', 'be', 'all', 'about', 'michael', 'jackson', 'so', 'unless', 'you', 'remo

# convert to str

In [325]:
LabelTrainDataFrame['review']=LabelTrainDataFrame.apply(ToStr, axis=1)
for i in LabelTrainDataFrame['review']:
    print(i)
    break

with all this stuff go down at the moment with mj have start listen to his music watch the odd documentary here and there watch the wiz and watch moonwalker again maybe just want to get certain insight into this guy who think be really cool in the eighty just to maybe make up my mind whether he be guilty or innocent moonwalker be part biography part feature film which remember go to see at the cinema when it be originally release some of it have subtle message about mj be feel towards the press and also the obvious message of drug be bad kay visually impressive but of course this be all about michael jackson so unless you remotely like mj in anyway then you be go to hate this and find it bore some may call mj an egotist for consent to the making of this movie but mj and most of his fan would say that he make it for the fan which if true be really nice of him the actual feature film bit when it finally start be only on for 20 minute or so exclude the smooth criminal sequence and joe pes

## save to txt

In [335]:
a.to_csv("test.txt",index=False)

In [342]:
DataPath = "test.txt"
DataFrame = pd.read_csv(DataPath,header=None)