In [1]:
import pandas as pd
import markovify 
import spacy
import re

import warnings
warnings.filterwarnings('ignore')

from time import time
import gc

In [2]:
df1 = pd.read_csv('CommentsJan2017.csv')
df2 = pd.read_csv('CommentsFeb2018.csv')
df3 = pd.read_csv('CommentsMarch2017.csv')
df4 = pd.read_csv('CommentsApril2018.csv')
df5 = pd.read_csv('CommentsMay2017.csv')
comments = pd.concat([df1, df2, df3, df4, df5])
comments.drop_duplicates(subset='commentID', inplace=True)

In [3]:
comments.head(3)

Unnamed: 0,approveDate,articleID,articleWordCount,commentBody,commentID,commentSequence,commentTitle,commentType,createDate,depth,...,status,timespeople,trusted,typeOfMaterial,updateDate,userDisplayName,userID,userLocation,userTitle,userURL
0,1483455908,58691a5795d0e039260788b9,1324.0,For all you Americans out there --- still rejo...,20969730.0,20969730.0,<br/>,comment,1483426105,1.0,...,approved,1,0,News,1483455908,N. Smith,64679318.0,New York City,,
1,1483455656,58691a5795d0e039260788b9,1324.0,Obamas policies may prove to be the least of t...,20969325.0,20969325.0,<br/>,comment,1483417407,1.0,...,approved,1,0,News,1483455656,Kilocharlie,69254188.0,Phoenix,,
2,1483455655,58691a5795d0e039260788b9,1324.0,Democrats are comprised of malcontents who gen...,20969855.0,20969855.0,<br/>,comment,1483431433,1.0,...,approved,1,0,News,1483455655,Frank Fryer,76788711.0,Florida,,


In [4]:
comments.shape

(1248024, 34)

In [5]:
comments.sectionName.value_counts()[:30]

Unknown                  735616
Politics                 282375
Sunday Review             85761
Europe                    29395
Middle East               17127
Asia Pacific              15672
Family                     9475
Television                 9435
Media                      9354
Live                       6204
Americas                   6020
Economy                    6011
Move                       4032
Eat                        3264
Olympics                   2810
DealBook                   2506
Art & Design               2417
Baseball                   1946
Canada                     1930
Pro Football               1625
Mind                       1442
Book Review                1296
Africa                     1278
Energy & Environment       1264
Room For Debate             987
Education Life              876
Music                       809
College Basketball          710
Opinion | Politics          683
Soccer                      669
Name: sectionName, dtype: int64

In [8]:
def preprocess(comments):
    commentBody = comments.loc[comments.sectionName=='Europe', 'commentBody']
    commentBody = commentBody.str.replace("(<br/>)", "")
    commentBody = commentBody.str.replace('(<a).*(>).*(</a>)', '')
    commentBody = commentBody.str.replace('(&amp)', '')
    commentBody = commentBody.str.replace('(&gt)', '')
    commentBody = commentBody.str.replace('(&lt)', '')
    commentBody = commentBody.str.replace('(\xa0)', ' ')  
    return commentBody

In [9]:
commentBody = preprocess(comments)
commentBody.shape

(29395,)

In [10]:
del comments, df1, df2, df3, df4, df5
gc.collect()

545

In [11]:
commentBody.sample().values[0]

"Let's deal with '5 years from now' tomorrow. Maybe we can enjoy just this one day of good news. It's been a long time coming. "

In [12]:
start_time = time()
comments_generator = markovify.Text(commentBody, state_size = 5)
print("Run time for training the generator : {} seconds".format(round(time()-start_time, 2)))

Run time for training the generator : 8.02 seconds


In [13]:
# Print randomly-generated comments using the built model
def generate_comments(generator, number=10, short=False):
    count = 0
    while count < number:
        if short:
            comment = generator.make_short_sentence(140)
        else:
            comment = generator.make_sentence()
        if comment:
            count += 1
            print("Comment {}".format(count))
            print(comment)
            print()
    

In [27]:
generate_comments(comments_generator)


Comment 1
I am ready for it I want something better in this country instead of letting money do all the talking.

Comment 2
Amazon: Price to earnings ratio about 235.Netflix: Price to earnings ratio about 235.Netflix: Price to earnings ratio about 235.Netflix: Price to earnings ratio about 30.Tesla: No earnings to talk about.

Comment 3
It is an excuse for tax cuts for the rich have preceded both the 1929 and 2007 depressions.

Comment 4
Obama inherited an economy that was losing 800,000 jobs a month and it has consistently been gaining 200,000 jobs a month.

Comment 5
Look at California where there the renewable energy sector is one of the fastest growing small biz and job categories in the nation.

Comment 6
Of course Trump will act like another one of those rich white guys who was born on third base thinks he hit a triple.

Comment 7
Despite corrections and bear markets, the stock market goes up the rich, and only the rich get richer.

Comment 8
I don't know, I pay the highest taxes

In [15]:
nlp = spacy.load('en_core_web_sm')

class POSifiedText(markovify.Text):
    def word_split(self, sentence):
        return ["::".join((word.orth_, word.pos_)) for word in nlp(sentence)]

    def word_join(self, words):
        sentence = " ".join(word.split("::")[0] for word in words)
        return sentence

In [17]:
df9= pd.read_csv('CommentsApril2018.csv')
commentBody = preprocess(df9)
commentBody.shape

(4414,)

In [18]:
del comments_generator, df9
gc.collect()

237

In [19]:
start_time = time()
comments_generator_POSified = POSifiedText(commentBody, state_size = 2)
print("Run time for training the generator : {} seconds".format(round(time()-start_time, 2)))

Run time for training the generator : 441.3 seconds


In [21]:
generate_comments(comments_generator_POSified)

Comment 1
You can always preach on the left .

Comment 2
I think he knows the influence of big money over our national parks for nothing , nothing for these white , but there are other options offered -   and chemtrails and gmo intake will that to Trump — Birds of a hospital that would never regenerate due to her own with former Eastern Bloc .

Comment 3
Very rarely are women given the outrage seems to have an incredibly stupid invasion and anyone who offers less than 50 % of his family , are the rule of law by our * in - chief .

Comment 4
Tough to be replaced by yet another flip flop position , but it seems the entire country from the NSA and other places rebuilt from ashes .

Comment 5
The monumental consequences for people like Mr. Doudi are entirely against equality or freedom for women , no laborer without rights , he 's totally absurd . Why is Trump , Trump will likely be President .

Comment 6
It 's time the republicans who accepted positions in this case , the birth of a bumpy