In [1]:
import pandas as pd
import csv
import matplotlib.pyplot as plt
import wikipedia as wiki
import warnings
import time
import requests
from bs4 import BeautifulSoup

# Book Reviews

In [2]:
# Load the book-review CSV. Only the first nine columns are read, and rows that
# cannot be parsed are skipped rather than aborting the load.
# NOTE(review): `error_bad_lines` was deprecated in pandas 1.3 in favour of
# `on_bad_lines` — confirm against the installed pandas version.
raw_df_book_reviews = pd.read_csv('data/br.csv', engine='python', encoding='utf-8', usecols=range(9), error_bad_lines=False)
raw_df_book_reviews.head(3)

Skipping line 312075: unexpected end of data


Unnamed: 0,bookID,title,author,rating,ratingsCount,reviewsCount,reviewerName,reviewerRatings,review
0,9,"Unauthorized Harry Potter Book Seven News: ""Ha...",W. Frederick Zimmerman,3.73,22,1,Charles G,3.0,
1,8,"Harry Potter Boxed Set, Books 1-5 (Harry Potte...",J.K. Rowling,4.77,34107,156,✿Katherine Elizabeth✿,5.0,
2,3,Harry Potter and the Sorcerer's Stone,J.K. Rowling,4.44,4911929,77741,Lora,5.0,I'm going to keep this brief since there isn't...


In [3]:
# Keep only the reviewer's rating and the review text.
subset_df_book_reviews = raw_df_book_reviews[['reviewerRatings', 'review']]
subset_df_book_reviews.head(3)

Unnamed: 0,reviewerRatings,review
0,3.0,
1,5.0,
2,5.0,I'm going to keep this brief since there isn't...


In [4]:
# Drop every row that has a missing value in either column, printing the row
# count before and after so the amount of discarded data is visible.
print(len(subset_df_book_reviews))
xna_df_book_reviews = subset_df_book_reviews.dropna(axis=0)
print(len(xna_df_book_reviews))
xna_df_book_reviews.head(3)

312073
109374


Unnamed: 0,reviewerRatings,review
2,5.0,I'm going to keep this brief since there isn't...
3,5.0,"""Read Harry Potter!"" they said. ""It'll be fun!..."
6,5.0,I do not own this spiffy box set of Harry Pott...


In [5]:
def get_book_review_polarity(rating):
    """Translate a reviewer rating into a polarity label.

    Returns 0 for ratings below 3, 1 for ratings above 3, and 0.5 for
    anything that is neither (a rating of exactly 3 in particular).
    """
    if rating < 3:
        return 0
    if rating > 3:
        return 1
    return 0.5

def get_book_review_intensity(rating):
    """Translate a reviewer rating into an intensity score.

    Ratings 1 and 5 score 1, ratings 2 and 4 score 0.5, and rating 3 scores 0.
    Any rating that is not one of those five values also scores 0.
    """
    intensity_by_rating = {1: 1, 2: 0.5, 3: 0, 4: 0.5, 5: 1}
    return intensity_by_rating.get(rating, 0)

def trim_sample(text, max_words=100):
    """Normalise whitespace in ``text`` and keep at most ``max_words`` words.

    The text is split on any whitespace and re-joined with single spaces, so
    runs of spaces, tabs and newlines collapse to one space.

    Args:
        text: The string to trim.
        max_words: Maximum number of leading words to keep (default 100).

    Returns:
        The first ``max_words`` words of ``text`` joined by single spaces.
    """
    # Slicing past the end of a list is safe, so short and long texts share
    # one code path.
    return ' '.join(text.split()[:max_words])

In [6]:
# Derive the label columns and the trimmed text, then discard the raw columns.
# `assign` returns a new frame, so the NaN-filtered frame is left untouched and
# the chained-assignment warning no longer has to be silenced.
clean_book_reviews = (
    xna_df_book_reviews
    .assign(
        polarity=xna_df_book_reviews['reviewerRatings'].apply(get_book_review_polarity),
        intensity=xna_df_book_reviews['reviewerRatings'].apply(get_book_review_intensity),
        text=xna_df_book_reviews['review'].apply(trim_sample),
    )
    .drop(axis=1, labels=['reviewerRatings', 'review'])
)

In [7]:
# Row count of the labelled book-review frame.
len(clean_book_reviews)

109374

In [9]:
# Build a 100,000-row book-review set: keep every neutral review (polarity 0.5)
# and fill the remainder with a random draw from the non-neutral reviews.
# The fixed random_state makes the draw repeatable across kernel restarts.
neu_book_reviews = clean_book_reviews[clean_book_reviews['polarity'] == 0.5]
remaining = 100000 - len(neu_book_reviews)
pol_book_reviews = clean_book_reviews[clean_book_reviews['polarity'] != 0.5].sample(n=remaining, random_state=42)
clean_book_reviews = pd.concat([neu_book_reviews, pol_book_reviews])

In [10]:
# Summary of the book-review set: row count, mean character length of the
# trimmed text, and the first three rows.
print("Number of records: {}".format(len(clean_book_reviews)))
print("Average length of record: {}".format(clean_book_reviews['text'].str.len().mean()))
print(clean_book_reviews.head(3))

Number of records: 100000
Average length of record: 572.5304
    polarity  intensity                                               text
13       0.5        0.0  Herman Melville’s poetry is an enigma. The man...
69       0.5        0.0  When I started reading Rusdie's Midnight's Chi...
74       0.5        0.0  Fourth installment in this amazing series. You...


# Product Reviews

In [11]:
# Load the `__label__N`-prefixed product reviews, one raw line per row in a
# single column named 0, then keep a random 200,000-row subset to work with.
# The fixed random_state makes the subset repeatable across kernel restarts.
# NOTE(review): newer pandas releases may reject sep='\n' — confirm against
# the installed pandas version.
raw_df_product_reviews = pd.read_csv('data/train.ft.txt', header=None, sep='\n', engine='python')
raw_df_product_reviews = raw_df_product_reviews.sample(n=200000, random_state=42)
raw_df_product_reviews.head()

Unnamed: 0,0
2381691,__label__1 Oh dear...: The one battle where th...
2656174,__label__2 Men's Red Dress Shirt: Love this sh...
1944737,__label__1 A one-trick piggy: Don't get me wro...
2757586,__label__1 Sad Day: Although I asked 3 or 4 ga...
1538068,__label__2 book: This book looks brand new. Ju...


In [12]:
def get_product_review_polarity(raw_text):
    """Return 1 when the character at index 9 of ``raw_text`` is '2', else 0.

    In a ``__label__N ...`` line, index 9 is the label digit ``N``.
    """
    label_digit = raw_text[9]
    return 1 if label_digit == '2' else 0

def get_product_review_text(raw_text):
    """Return ``raw_text`` with its first 11 characters removed."""
    prefix_length = 11  # length of a "__label__N " prefix
    return raw_text[prefix_length:]

# Derive the label columns from each raw line: polarity comes from the label
# digit, intensity is set to 1 for every row, and the text is the line with
# its prefix stripped and then trimmed. The raw column 0 is dropped afterwards.
raw_df_product_reviews['polarity'] = raw_df_product_reviews[0].apply(get_product_review_polarity)
raw_df_product_reviews['intensity'] = 1
raw_df_product_reviews['text'] = raw_df_product_reviews[0].apply(get_product_review_text).apply(trim_sample)
clean_product_reviews = raw_df_product_reviews.drop(axis=1, labels=0)

In [13]:
# Balance the product reviews: draw 50,000 positive and 50,000 negative rows
# and stack them. The fixed random_state makes both draws repeatable across
# kernel restarts.
pos_pol_product_reviews = clean_product_reviews[clean_product_reviews['polarity'] == 1].sample(n=50000, random_state=42)
neg_pol_product_reviews = clean_product_reviews[clean_product_reviews['polarity'] == 0].sample(n=50000, random_state=42)
clean_product_reviews = pd.concat([pos_pol_product_reviews, neg_pol_product_reviews])

In [14]:
# Summary of the product-review set: row count, mean character length of the
# trimmed text, and the first three rows.
print("Number of records: {}".format(len(clean_product_reviews)))
print("Average length of record: {}".format(clean_product_reviews['text'].str.len().mean()))
print(clean_product_reviews.head(3))

Number of records: 100000
Average length of record: 374.76333
         polarity  intensity  \
24352           1          1   
2563649         1          1   
3348421         1          1   

                                                      text  
24352    Good scrub: I purchased the scrub brush to cle...  
2563649  Great Map of Venice: And sturdy to carry aroun...  
3348421  COMPREHENSIVE AND WELL WRITTEN BOOK ON DAM HYD...  


# Wikipedia Articles

In [11]:
# Seed terms for the Wikipedia search: a newline-separated block of nouns,
# split on whitespace into a list of 25 strings.
most_common_nouns = """time
person
year
way
day
thing
man
world
life
hand
part
child
eye
woman
place
work
week
case
point
government
company
number
group
problem
fact""".split()
most_common_nouns[:3]

['time', 'person', 'year']

In [23]:
def prepare_samples(text):
    """Split ``text`` into consecutive, equally sized word chunks.

    Every chunk holds ``int(500 / 4.5)`` (= 111) words joined by single
    spaces. Trailing words that do not fill a whole chunk are discarded.

    Args:
        text: The article text to split.

    Returns:
        A list with one string per full chunk, in document order. The list is
        empty when ``text`` has fewer words than one chunk.
    """
    # presumably ~500 characters at ~4.5 characters per word — TODO confirm
    target_length = int(500 / 4.5)
    words = text.split()
    # Floor division already counts only the full chunks, so every one of
    # them is emitted; subtracting one would drop the last full chunk.
    full_chunks = len(words) // target_length
    return [
        ' '.join(words[i * target_length:(i + 1) * target_length])
        for i in range(full_chunks)
    ]

In [29]:
# Accumulators for the Wikipedia crawl: collected text chunks plus counters
# for pages that failed and pages that were fetched.
samples = []
errors = 0
successes = 0
def print_status(errors, successes, msg="prev"):
    """Print a one-line progress report, then pause for three seconds.

    The line is padded with spaces and ends in a carriage return instead of a
    newline, so the next report overwrites it in place.
    """
    status = "errors: {}\tsuccesses: {}\t\t[finished {}]".format(errors, successes, msg)
    padding = " " * 30
    print(status, padding, end='\r')
    time.sleep(3)

# Collect text chunks from Wikipedia: first the top three search hits for every
# common noun, then 100 random articles. Pages that resolve to a disambiguation
# page or cannot be found are counted as errors and skipped, so one bad title
# does not abort the whole crawl.
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    for term in most_common_nouns:
        results = wiki.search(term, results=3)
        for result in results:
            try:
                text = wiki.page(result).content
                samples += prepare_samples(text)
                successes += 1
                print_status(errors, successes, result)
            except (wiki.exceptions.DisambiguationError, wiki.exceptions.PageError):
                errors += 1
                print_status(errors, successes, result)
    for term in wiki.random(pages=100):
        try:
            # Fetch the random title itself; `result` is only the last search
            # hit left over from the loop above.
            text = wiki.page(term).content
            samples += prepare_samples(text)
            successes += 1
            print_status(errors, successes, term)
        except (wiki.exceptions.DisambiguationError, wiki.exceptions.PageError):
            errors += 1
            print_status(errors, successes, term)

errors: 16	successes: 159		[finished Queensland Railways 3900 class]                                                             

In [33]:
# Augment the corpus with shorter fragments of every chunk. Each chunk is cut
# into quarters, then thirds, then halves; the piece size is the integer share
# minus one word, and the final piece of each cut takes all remaining words.
# The fragments are appended to `samples` alongside the original chunks.
new_samples = []
for sample in samples:
    words = sample.split()
    for parts in (4, 3, 2):
        step = (len(words) // parts) - 1
        for k in range(parts - 1):
            new_samples.append(' '.join(words[k * step:(k + 1) * step]))
        new_samples.append(' '.join(words[(parts - 1) * step:]))
samples += new_samples

In [78]:
# Put the collected chunks into a single-column frame and keep a random 30,000
# of them. The fixed random_state makes the draw repeatable across kernel
# restarts.
clean_wiki_articles = pd.DataFrame({'text': samples})
clean_wiki_articles = clean_wiki_articles.sample(n=30000, random_state=42)

In [79]:
# Label every Wikipedia chunk with intensity 0.0 and polarity 0.5, the same
# values the book-review helpers give a rating of 3.
clean_wiki_articles['intensity'] = 0.0
clean_wiki_articles['polarity'] = 0.5

In [81]:
# Summary of the Wikipedia set: row count, mean character length of the text,
# and the first five rows.
print("Number of records: {}".format(len(clean_wiki_articles)))
print("Average length of record: {}".format(clean_wiki_articles['text'].str.len().mean()))
print(clean_wiki_articles.head())

Number of records: 30000
Average length of record: 272.0262666666667
                                                    text  intensity  polarity
7628   Thing, from featuring Russell in a starring ro...        0.0       0.5
14279  the full development of complex numbers. The r...        0.0       0.5
715    row (scaphoid, lunate, triquetral and pisiform...        0.0       0.5
17849  as to emphasize a point or prove a disputed is...        0.0       0.5
9662   thumb, the first and second lumbrical. The uln...        0.0       0.5


# Adding Data Together

In [95]:
# Stack the three cleaned datasets into one frame. Each source keeps its own
# row index, so index values are not guaranteed to be unique in the result.
clean_text = pd.concat([clean_book_reviews, clean_product_reviews, clean_wiki_articles])
print(clean_text.head(3))

        intensity  polarity                                               text
54157         1.0       0.0  My biggest problem with the book is that it is...
279088        1.0       0.0   1 HAVING MY CAKE AND EATING IT STARSWell if y...
38636         1.0       0.0  [image error]It may be unfair, even mean-spiri...


In [96]:
# Show how many rows carry each intensity value and each polarity value.
print(clean_text['intensity'].value_counts())
print(clean_text['polarity'].value_counts())

0.0    54060
1.0    43124
0.5     2816
Name: intensity, dtype: int64
0.5    54060
1.0    22970
0.0    22970
Name: polarity, dtype: int64


In [97]:
# Shuffle the rows: sampling the whole frame (frac=1) without replacement
# returns every row in random order.
clean_text = clean_text.sample(frac=1)
clean_text.head()

Unnamed: 0,intensity,polarity,text
18570,0.0,0.5,"true fact, (e.g., ""the author's facts are not ..."
27955,0.0,0.5,the French government has chosen Paris to be t...
1664746,1.0,0.0,KNOCK OFF VERSION! BUY THE GREY ONE!: My boyfr...
2133092,1.0,0.0,Surely this can't be the best one out there?: ...
207087,0.0,0.5,Charles Willeford was one of the last of the h...


In [98]:
# Shuffle the rows again and write them to clean_text.csv without the index
# column.
# NOTE(review): the frame is also shuffled in the preceding cell, so this
# second shuffle looks redundant — confirm before removing.
clean_text = clean_text.sample(frac=1)
clean_text.to_csv('clean_text.csv', index=False)