In [24]:
import csv
import matplotlib.pyplot as plt
import pandas as pd
import requests
import time
import warnings
import wikipedia as wiki

from bs4 import BeautifulSoup
from nltk import tokenize

In [27]:
def sentence_splitter(unsplit_df, include_original=True):
    """Expand each row of ``unsplit_df`` into one row per sentence.

    Every sentence returned by ``tokenize.sent_tokenize`` inherits the
    ``polarity`` and ``intensity`` of the row it was cut from.

    Args:
        unsplit_df: frame providing ``text``, ``polarity`` and ``intensity``
            columns.
        include_original: when true, the full unsplit text is emitted as one
            extra row directly after its sentences.

    Returns:
        A new ``pd.DataFrame`` with the columns ``text``, ``polarity`` and
        ``intensity`` (in that order) and a default integer index.
    """
    texts, polarities, intensities = [], [], []
    source_rows = zip(unsplit_df['text'], unsplit_df['polarity'], unsplit_df['intensity'])
    for text, polarity, intensity in source_rows:
        # Gather every sample this row contributes, then label them in bulk.
        samples = list(tokenize.sent_tokenize(text))
        if include_original:
            samples.append(text)
        texts.extend(samples)
        polarities.extend([polarity] * len(samples))
        intensities.extend([intensity] * len(samples))
    return pd.DataFrame({'text': texts, 'polarity': polarities, 'intensity': intensities})

# Book Reviews

In [28]:
# Load the raw book-review dump, keeping only the first nine columns.
# Malformed rows are skipped with a warning instead of aborting the load.
book_reviews_csv_options = dict(engine='python', encoding='utf-8', usecols=range(9))
try:
    # Current spelling; `error_bad_lines` no longer exists in pandas 2.x.
    # 'warn' mirrors the old error_bad_lines=False + default warn_bad_lines=True.
    raw_df_book_reviews = pd.read_csv('data/br.csv', on_bad_lines='warn', **book_reviews_csv_options)
except TypeError:
    # Older pandas rejects the unknown `on_bad_lines` keyword before reading
    # anything, so fall back to the legacy flag.
    raw_df_book_reviews = pd.read_csv('data/br.csv', error_bad_lines=False, **book_reviews_csv_options)
raw_df_book_reviews.head(3)

Skipping line 312075: unexpected end of data


Unnamed: 0,bookID,title,author,rating,ratingsCount,reviewsCount,reviewerName,reviewerRatings,review
0,9,"Unauthorized Harry Potter Book Seven News: ""Ha...",W. Frederick Zimmerman,3.73,22,1,Charles G,3.0,
1,8,"Harry Potter Boxed Set, Books 1-5 (Harry Potte...",J.K. Rowling,4.77,34107,156,✿Katherine Elizabeth✿,5.0,
2,3,Harry Potter and the Sorcerer's Stone,J.K. Rowling,4.44,4911929,77741,Lora,5.0,I'm going to keep this brief since there isn't...


In [29]:
# Keep only the two columns used downstream: the reviewer's star rating
# (source of the labels) and the free-text review (source of the samples).
subset_df_book_reviews = raw_df_book_reviews[['reviewerRatings', 'review']]
subset_df_book_reviews.head(3)

Unnamed: 0,reviewerRatings,review
0,3.0,
1,5.0,
2,5.0,I'm going to keep this brief since there isn't...


In [30]:
# Row count before and after dropping rows that lack a rating or a review.
print(len(subset_df_book_reviews))
xna_df_book_reviews = subset_df_book_reviews.dropna(axis=0)
print(len(xna_df_book_reviews))
# NOTE(review): not the cell's last expression, so this preview is never displayed.
xna_df_book_reviews.head(3)
# Distribution of the remaining reviewer ratings.
xna_df_book_reviews['reviewerRatings'].value_counts()

312073
109374


4.0    40942
5.0    38660
3.0    24060
1.0     5712
Name: reviewerRatings, dtype: int64

In [31]:
def get_book_review_polarity(rating):
    """Binarize a star rating into a polarity label.

    Returns 1 when ``rating`` is above 3 and 0 when it is 3 or below. A value
    that satisfies neither comparison (e.g. NaN) yields ``None``.
    """
    if rating > 3:
        return 1
    return 0 if rating <= 3 else None

def get_book_review_intensity(rating):
    """Map a star rating to a sentiment intensity.

    Ratings 1 and 5 map to 1, ratings 2 and 4 to 0.5, and rating 3 to 0.
    Any rating that is not one of those keys falls back to 0.
    """
    intensity_by_rating = {1: 1, 2: 0.5, 3: 0, 4: 0.5, 5: 1}
    # Float ratings such as 4.0 hash equal to their integer keys.
    return intensity_by_rating.get(rating, 0)

def trim_sample(text, max_words=100):
    """Normalize whitespace and cap ``text`` at ``max_words`` words.

    Args:
        text: string to trim; words are delimited by any run of whitespace
            (``str.split()`` semantics).
        max_words: maximum number of leading words to keep. Defaults to 100.

    Returns:
        The kept words joined by single spaces.

    Raises:
        ValueError: if ``max_words`` is negative.
    """
    if max_words < 0:
        # A negative bound would silently drop words from the end via slicing.
        raise ValueError("max_words must be non-negative, got {}".format(max_words))
    # Slicing past the end is a no-op, so short texts come back whole.
    return ' '.join(text.split()[:max_words])

In [32]:
# Derive the label columns and the trimmed text from the rating and review.
# The chained-assignment warning is silenced only inside this block; the
# columns are added to xna_df_book_reviews itself.
with pd.option_context('mode.chained_assignment', None):
    xna_df_book_reviews['polarity'] = xna_df_book_reviews['reviewerRatings'].apply(get_book_review_polarity)
    xna_df_book_reviews['intensity'] = xna_df_book_reviews['reviewerRatings'].apply(get_book_review_intensity)
    xna_df_book_reviews['text'] = xna_df_book_reviews['review'].apply(trim_sample)
    # Drop the source columns so only polarity, intensity and text remain.
    clean_book_reviews = xna_df_book_reviews.drop(axis=1, labels=['reviewerRatings', 'review'])

In [33]:
# Class balance of the polarity label at review level (before sentence splitting).
clean_book_reviews['polarity'].value_counts()

1    79602
0    29772
Name: polarity, dtype: int64

In [36]:
# Expand every review into its sentences plus the full review text.
# NOTE(review): the name is rebound to the split frame, so re-running this
# cell would split the already-split rows again.
clean_book_reviews = sentence_splitter(clean_book_reviews)
clean_book_reviews['polarity'].value_counts()

1    475387
0    181220
Name: polarity, dtype: int64

In [50]:
# Balance the classes by drawing 50,000 samples per polarity.
# The fixed random_state makes the draw repeatable across kernel restarts.
pos_book_reviews = clean_book_reviews[clean_book_reviews['polarity'] == 1].sample(n=50000, random_state=42)
neg_book_reviews = clean_book_reviews[clean_book_reviews['polarity'] == 0].sample(n=50000, random_state=42)
clean_book_reviews = pd.concat([pos_book_reviews, neg_book_reviews])

In [51]:
# Sanity check of the balanced book-review sample: size, mean character
# length of the text column, and a small preview.
print("Number of records: {}".format(len(clean_book_reviews)))
print("Average length of record: {}".format(clean_book_reviews['text'].str.len().mean()))
print(clean_book_reviews.head(3))

Number of records: 100000
Average length of record: 188.59861
                                                     text  polarity  intensity
234618  I understand why it is, but I think those who ...         1        0.5
345664  'Misery' is a gruesome story of torture with b...         1        1.0
224720  The story concentrates on the childhood of Rob...         1        0.5


# Product Reviews

In [39]:
# Read the label-prefixed review file one record per line. Plain file reading
# is used because recent pandas releases reject sep='\n' in read_csv.
with open('data/train.ft.txt', encoding='utf-8') as product_file:
    # Strip the line terminator and skip blank lines, as read_csv did.
    product_review_lines = [line.rstrip('\r\n') for line in product_file if line.strip()]
# Column label 0 matches what read_csv(header=None) produced, so the cells
# that index raw_df_product_reviews[0] keep working unchanged.
raw_df_product_reviews = pd.DataFrame({0: product_review_lines})
# Fixed seed so the 200,000-row subsample is reproducible.
raw_df_product_reviews = raw_df_product_reviews.sample(n=200000, random_state=42)
raw_df_product_reviews.head()

Unnamed: 0,0
2890773,__label__1 Garbage: I am extremely dissapointe...
361494,__label__1 Ultraviolet copy is a scam!!! Not i...
4459,__label__1 NOT WATER PROOF: Don't waste your m...
1531757,__label__1 An awful misinterpretation of Herbi...
3336509,__label__2 The Bourne Trilogy Blu-Ray!: The Bo...


In [40]:
def get_product_review_polarity(raw_text):
    """Return 1 when the character at index 9 of ``raw_text`` is ``'2'``, else 0.

    In the rows previewed in this notebook, index 9 is the digit that follows
    the ``__label__`` prefix.
    """
    label_digit = raw_text[9]
    return 1 if label_digit == '2' else 0

def get_product_review_text(raw_text):
    """Return ``raw_text`` with its first 11 characters removed.

    In the rows previewed in this notebook those characters are the
    ``__label__N`` tag and the space that follows it.
    """
    prefix_length = 11
    return raw_text[prefix_length:]

# Split each raw line (column 0) into its polarity label and review text.
raw_df_product_reviews['polarity'] = raw_df_product_reviews[0].apply(get_product_review_polarity)
# Every product review is assigned the same intensity of 1.
raw_df_product_reviews['intensity'] = 1
# Strip the label prefix first, then trim the remaining text with trim_sample.
raw_df_product_reviews['text'] = raw_df_product_reviews[0].apply(get_product_review_text).apply(trim_sample)
# Drop the raw line column now that its parts have been extracted.
clean_product_reviews = raw_df_product_reviews.drop(axis=1, labels=0)

In [42]:
# Class balance of the product reviews at review level (before sentence splitting).
clean_product_reviews['polarity'].value_counts()

0    100455
1     99545
Name: polarity, dtype: int64

In [43]:
# Expand every product review into its sentences plus the full review text.
# NOTE(review): the name is rebound to the split frame, so re-running this
# cell would split the already-split rows again.
clean_product_reviews = sentence_splitter(clean_product_reviews)
clean_product_reviews['polarity'].value_counts()

0    564688
1    533647
Name: polarity, dtype: int64

In [52]:
# Balance the classes by drawing 50,000 samples per polarity.
# The fixed random_state makes the draw repeatable across kernel restarts.
pos_pol_product_reviews = clean_product_reviews[clean_product_reviews['polarity'] == 1].sample(n=50000, random_state=42)
neg_pol_product_reviews = clean_product_reviews[clean_product_reviews['polarity'] == 0].sample(n=50000, random_state=42)
clean_product_reviews = pd.concat([pos_pol_product_reviews, neg_pol_product_reviews])

In [53]:
# Sanity check of the balanced product-review sample: size, mean character
# length of the text column, and a small preview.
print("Number of records: {}".format(len(clean_product_reviews)))
print("Average length of record: {}".format(clean_product_reviews['text'].str.len().mean()))
print(clean_product_reviews.head(3))

Number of records: 100000
Average length of record: 136.34723
                                                     text  polarity  intensity
653926  I wanted to learn PHP so that I could start pr...         1          1
457574  If you have or are planning on having a surrou...         1          1
251531  NOTE: While a three novel book, the volume is ...         1          1


# Medium

In [54]:
# Load the Medium samples: one text fragment per line of the file.
# The encoding is pinned to UTF-8 (matching the book-review load) so the
# non-ASCII punctuation in the text decodes identically on every platform
# instead of depending on the locale's default encoding.
with open('data/medium_samples.csv', encoding='utf-8') as medium_file:
    # Strip surrounding whitespace, including the trailing newline.
    raw_medium_articles_list = [article.strip() for article in medium_file]
raw_medium_articles = pd.DataFrame({'text': raw_medium_articles_list})
raw_medium_articles.head(3)

Unnamed: 0,text
0,The big lesson here is that it’s not enough to...
1,Restaurant menus come in all shapes and sizes....
2,"Raja Ramachandran, CEO of ripe.io identified c..."


In [59]:
# Give every Medium fragment the same labels: polarity 0.5 and intensity 0.0
# (presumably the neutral midpoint between the 0/1 review labels — confirm).
raw_medium_articles['polarity'] = 0.5
raw_medium_articles['intensity'] = 0.0
# NOTE(review): this binds a second name to the same DataFrame; it is not a copy.
clean_medium_articles = raw_medium_articles

In [60]:
# Number of Medium fragments before sentence splitting (single polarity value).
clean_medium_articles['polarity'].value_counts()

0.5    20518
Name: polarity, dtype: int64

In [61]:
# Expand every Medium fragment into its sentences plus the full fragment.
# NOTE(review): the name is rebound to the split frame, so re-running this
# cell would split the already-split rows again.
clean_medium_articles = sentence_splitter(clean_medium_articles)
clean_medium_articles['polarity'].value_counts()

0.5    128478
Name: polarity, dtype: int64

In [62]:
# Down-sample to 100,000 rows, the same size as each review corpus.
# The fixed random_state keeps the selection reproducible.
clean_medium_articles = clean_medium_articles.sample(n=100000, random_state=42)

In [63]:
# Sanity check of the Medium sample: size, mean character length of the
# text column, and a small preview.
print("Number of records: {}".format(len(clean_medium_articles)))
print("Average length of record: {}".format(clean_medium_articles['text'].str.len().mean()))
print(clean_medium_articles.head(3))

Number of records: 100000
Average length of record: 183.69261
                                                     text  polarity  intensity
106152  Jane had been so young when he died and the fu...       0.5        0.0
45696   You’ll see size-2 girls wearing microscopic sh...       0.5        0.0
25508   Annapurna Base Camp Trekking is the mixture of...       0.5        0.0


# Wikipedia Articles

In [64]:
# Load the Wikipedia samples: one text fragment per line of the file.
# The encoding is pinned to UTF-8 (matching the book-review load) because the
# text contains characters outside ASCII that would otherwise be decoded with
# the locale's default encoding.
with open('data/wikipedia_samples.csv', encoding='utf-8') as wiki_file:
    # Strip surrounding whitespace, including the trailing newline.
    raw_wiki_articles_list = [article.strip() for article in wiki_file]
raw_wiki_articles = pd.DataFrame({'text': raw_wiki_articles_list})
raw_wiki_articles.head(3)

Unnamed: 0,text
0,T (named tee ) is the 20th letter in the moder...
1,⟨t⟩ usually denotes the voiceless alveolar plo...
2,Thomas and thyme.) === Other languages === In ...


In [67]:
# Give every Wikipedia fragment the same labels: polarity 0.5 and intensity 0.0
# (presumably the neutral midpoint between the 0/1 review labels — confirm).
raw_wiki_articles['polarity'] = 0.5
raw_wiki_articles['intensity'] = 0.0
# NOTE(review): this binds a second name to the same DataFrame; it is not a copy.
clean_wiki_articles = raw_wiki_articles

In [68]:
# Number of Wikipedia fragments before sentence splitting (single polarity value).
clean_wiki_articles['polarity'].value_counts()

0.5    27080
Name: polarity, dtype: int64

In [69]:
# Expand every Wikipedia fragment into its sentences plus the full fragment.
# NOTE(review): the name is rebound to the split frame, so re-running this
# cell would split the already-split rows again.
clean_wiki_articles = sentence_splitter(clean_wiki_articles)
clean_wiki_articles['polarity'].value_counts()

0.5    150409
Name: polarity, dtype: int64

In [70]:
# Down-sample to 100,000 rows, the same size as each review corpus.
# The fixed random_state keeps the selection reproducible.
clean_wiki_articles = clean_wiki_articles.sample(n=100000, random_state=42)
# Sanity check: size, mean character length of the text column, and a preview.
print("Number of records: {}".format(len(clean_wiki_articles)))
print("Average length of record: {}".format(clean_wiki_articles['text'].str.len().mean()))
print(clean_wiki_articles.head(3))

Number of records: 100000
Average length of record: 209.50964
                                                     text  polarity  intensity
12690   }=\hbar \left({\frac {\omega }{c}},{\vec {k}}\...       0.5        0.0
67827   letter small reversed epsilon / open e ɞ : Lat...       0.5        0.0
143899                                          In Dutch,       0.5        0.0


# Adding Data Together

In [80]:
# Stack the four corpora (book reviews, product reviews, Medium, Wikipedia)
# into one frame. Each frame's own index labels are carried over here.
clean_text = pd.concat([clean_book_reviews, clean_product_reviews, clean_medium_articles, clean_wiki_articles])
clean_text.head(3)

Unnamed: 0_level_0,text,polarity,intensity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
234618,"I understand why it is, but I think those who ...",1.0,0.5
345664,'Misery' is a gruesome story of torture with b...,1.0,1.0
224720,The story concentrates on the childhood of Rob...,1.0,0.5


In [81]:
# Label distributions of the combined dataset: intensity first, then polarity.
print(clean_text['intensity'].value_counts())
print(clean_text['polarity'].value_counts())

0.0    239626
1.0    135071
0.5     25303
Name: intensity, dtype: int64
0.5    200000
0.0    100000
1.0    100000
Name: polarity, dtype: int64


In [84]:
# Shuffle all rows (frac=1 keeps every row) so the corpora are interleaved,
# then replace the inherited index labels with a fresh 0..n-1 range.
# The fixed random_state makes the shuffled order reproducible.
clean_text = clean_text.sample(frac=1, random_state=42).reset_index(drop=True)
clean_text.index.name = 'id'
clean_text.head()

Unnamed: 0_level_0,text,polarity,intensity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,The above procedure describes not only the gro...,0.5,0.0
1,"After he won the election, they began printing...",0.5,0.0
2,"She handled the novel professionally, and as a...",0.0,0.0
3,"If n is a positive integer, an nth primitive r...",0.5,0.0
4,The DVD is stuck inside and we can not get it ...,0.0,1.0


In [85]:
# Persist the combined, shuffled dataset for downstream use.
clean_text.to_csv('data/clean_text.csv')

In [None]:
# NOTE(review): unexecuted scratch cell. `.loc` is called with no arguments
# and the result is not used — looks like leftover exploration; consider deleting.
clean_text['text'].loc()