Tested cleaning methods in cleaning.py, here a smaller dataframe will be produced from the raw data that I've scraped

In [1]:
import pandas as pd
import numpy as np
import os
from bs4 import BeautifulSoup
import re
from sklearn.feature_extraction.text import CountVectorizer
import pickle
import string

In [2]:
data_path1 = 'ign_data/raw_review_html_no_edit_inc.csv' # this path doesn't exist
data_path2 = 'ign_data/unedited_review_html.csv'

In [3]:
raw_df = pd.read_csv(data_path2, sep='`')
raw_df.head()

Unnamed: 0.1,Unnamed: 0,link,some_unfiltered_html
0,0,https://www.ign.com/articles/2011/02/10/wheres...,"<script id=""__NEXT_DATA__"" type=""application/j..."
1,1,https://www.ign.com/articles/2011/02/16/playst...,"<script id=""__NEXT_DATA__"" type=""application/j..."
2,2,https://www.ign.com/articles/2011/08/30/wareho...,"<script id=""__NEXT_DATA__"" type=""application/j..."
3,3,https://www.ign.com/articles/2011/08/30/alphas...,
4,4,https://www.ign.com/articles/2011/09/29/the-sa...,"<script id=""__NEXT_DATA__"" type=""application/j..."


In [4]:
nan_df = raw_df.loc[pd.isna(raw_df['some_unfiltered_html'])]
print(nan_df.shape)
print(nan_df['link'].loc[1001])
nan_df.head()

(152, 3)
https://www.ign.com/articles/2010/05/06/energy-cf-50-floorstanding-speakers-review


Unnamed: 0.1,Unnamed: 0,link,some_unfiltered_html
3,3,https://www.ign.com/articles/2011/08/30/alphas...,
43,43,https://www.ign.com/articles/2011/07/27/captai...,
989,89,https://www.ign.com/articles/2010/02/02/alienw...,
1001,1,https://www.ign.com/articles/2010/05/06/energy...,
1343,43,https://www.ign.com/articles/2011/12/30/beavis...,


In [5]:
# separating out nan data
raw_notna_df = raw_df.loc[pd.notna(raw_df['some_unfiltered_html'])]
raw_notna_df.head()

Unnamed: 0.1,Unnamed: 0,link,some_unfiltered_html
0,0,https://www.ign.com/articles/2011/02/10/wheres...,"<script id=""__NEXT_DATA__"" type=""application/j..."
1,1,https://www.ign.com/articles/2011/02/16/playst...,"<script id=""__NEXT_DATA__"" type=""application/j..."
2,2,https://www.ign.com/articles/2011/08/30/wareho...,"<script id=""__NEXT_DATA__"" type=""application/j..."
4,4,https://www.ign.com/articles/2011/09/29/the-sa...,"<script id=""__NEXT_DATA__"" type=""application/j..."
5,5,https://www.ign.com/articles/2011/07/19/warham...,"<script id=""__NEXT_DATA__"" type=""application/j..."


In [6]:
raw_notna_df.drop(columns='Unnamed: 0',inplace=True)
raw_notna_df.reset_index(inplace=True, drop=True)
raw_notna_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


Unnamed: 0,link,some_unfiltered_html
0,https://www.ign.com/articles/2011/02/10/wheres...,"<script id=""__NEXT_DATA__"" type=""application/j..."
1,https://www.ign.com/articles/2011/02/16/playst...,"<script id=""__NEXT_DATA__"" type=""application/j..."
2,https://www.ign.com/articles/2011/08/30/wareho...,"<script id=""__NEXT_DATA__"" type=""application/j..."
3,https://www.ign.com/articles/2011/09/29/the-sa...,"<script id=""__NEXT_DATA__"" type=""application/j..."
4,https://www.ign.com/articles/2011/07/19/warham...,"<script id=""__NEXT_DATA__"" type=""application/j..."


In [7]:
# define cleaning functions

def split_script_to_parts(soup_text):
    
    #review_script = review_soup.find('script',{'id':'__NEXT_DATA__'}).string
    review_script = soup_text
    mark1 = re.search('paginatedHtmlPage', review_script)

    mod_date_place = re.search('"modifiedDate"', review_script)
    try:
        mod_date = review_script[mod_date_place.end()+2:mod_date_place.end()+26]
    except AttributeError:
        mod_date = ''

    pub_date_place = re.search('"publishDate"', review_script)
    try:
        pub_date = review_script[pub_date_place.end()+2:pub_date_place.end()+26]
    except AttributeError:
        pub_date = ''

    tag_slug_place = re.search('"tagSlugs"', review_script)
    cat_slug_place = re.search('"categorySlugs"', review_script)
    eve_slug_place = re.search('"eventSlugs"', review_script) # don't care about these, but a good stop point

    # can almost guarantee a review is a game now
    try:
        tags = review_script[tag_slug_place.end():cat_slug_place.start()]
    except AttributeError:
        tags = ''
    try:
        categories = review_script[cat_slug_place.end():eve_slug_place.start()]
    except AttributeError:
        categories = ''
    
    # cut down on how much script I have to look at and clean
    try:
        pag_html_place = re.search('paginatedHtmlPage', review_script)
        inter_script = review_script[pag_html_place.end():]
        script_start_place = re.search('":\["', inter_script)
        script_end_place = re.search('"\],"', inter_script)
        review_body = inter_script[script_start_place.end():script_end_place.start()]
    except AttributeError:
        review_body = ''

    # some other information that I may want
    auth_dets_place = re.search('"authorSocialDetails"', review_script)
    auth_dets_stop = re.search('"__typename":"Article"', review_script)
    try:
        auth_dets = review_script[auth_dets_place.end():auth_dets_stop.start()]
    except AttributeError:
        auth_dets = ''
    
    name_place_start = re.search('"name":', auth_dets)
    name_place_end = re.search(',"thumbnailUrl"', auth_dets)
    try:
        auth_name = auth_dets[name_place_start.end():name_place_end.start()]
    except AttributeError:
        auth_name = ''
    
    # review summary and score/stuff
    verd_start_place = re.search('"review":{', review_script)
    verd_end_place = re.search('"Review"}', review_script)
    try:
        verdict_full = review_script[verd_start_place.end():verd_end_place.start()]
    except AttributeError:
        verdict_full = ''

    # could try to put this from the review_script instead of the verdict piece
    verd_text_start = re.search('"comment":', verdict_full)
    verd_text_end = re.search(',"score":', verdict_full)
    verd_score_text = re.search(',"scoreText"', verdict_full)
    verd_choice = re.search('"editorsChoice"', verdict_full)

    try:
        review_summary = verdict_full[verd_text_start.end():verd_text_end.start()]
    except AttributeError:
        review_summary = ''
    try:
        review_score = verdict_full[verd_text_end.end():verd_score_text.start()]
    except AttributeError:
        review_score = ''
    try:
        review_score_text = verdict_full[verd_score_text.end():verd_choice.start()]
    except AttributeError:
        review_score_text = ''
        
    title_start = re.search('"title":', review_script)
    title_end = re.search(',"url"', review_script)
    title = review_script[title_start.end():title_end.start()]
    
    review_info = {'review_summary': review_summary, 'review_score': review_score, 
                   'review_score_text': review_score_text, 'review_body': review_body, 
                   'tags': tags, 'categories': categories, 'modified_date': mod_date, 
                   'published_date': pub_date, 'author_details': auth_dets, 
                   'author_name': auth_name, 'review_title': title}
    
    return review_info

In [15]:
def review_text_cleaner(text):
    
    final_text = re.sub('\\\\u003e', '', text)
    final_text = re.sub('\\\\u003cbr /', '', final_text)
    final_text = re.sub('\\\\u003csection', '', final_text)
    final_text = re.sub('\\\\u003c/section', '', final_text)
    final_text = re.sub('\\\\u003cp', '', final_text)
    final_text = re.sub('\\\\u003cstrong', '', final_text)
    final_text = re.sub('\\\\u003c/strong', '', final_text)
    final_text = re.sub('\\\\u003c/p', '', final_text)
    final_text = re.sub('data-transform', '', final_text)
    final_text = re.sub('=', '', final_text)
    final_text = re.sub('cellpadding', '', final_text)
    final_text = re.sub('cellspacing', '', final_text)
    final_text = re.sub('table width', '', final_text)
    final_text = re.sub('class=\'autolink\'', '', final_text)
    final_text = re.sub('class\'autolink\'', '', final_text)
    final_text = re.sub('class\\\\"autolink\\\\"', '', final_text)
    final_text = re.sub('class=\\\\"article-page\\\\"', '', final_text)
    final_text = re.sub('class\\\\"article-page\\\\"', '', final_text)
    final_text = re.sub('class\\\\"aligncenter\\\\"', '', final_text)
    final_text = re.sub('\\\\"mobile-ad-break\\\\"', '', final_text)
    final_text = re.sub('\\\\u003c/a', '', final_text)
    final_text = re.sub('\\\\u003ca', '', final_text)
    final_text = re.sub('\\\\u003c!', '', final_text)
    final_text = re.sub('\\\\u003cem', '', final_text)
    final_text = re.sub('\\\\u003c/em', '', final_text)
    final_text = re.sub('data-slug', '', final_text)
    final_text = re.sub('\\\\u0026#x2026;', '', final_text)
    final_text = re.sub('","', '', final_text)
    final_text = re.sub('\\\\u003ctr', '', final_text)
    final_text = re.sub('\\\\u003c/td', '', final_text)
    final_text = re.sub('\\\\u003ctd', '', final_text)
    final_text = re.sub('\\\\u003c/td', '', final_text)
    final_text = re.sub('data-loop', '', final_text)
    final_text = re.sub('\\\\u003cbr', '', final_text)
    final_text = re.sub('\\\\u003cBR', '', final_text)
    final_text = re.sub('data-type\\\\"slug\\\\"', '', final_text)
    final_text = re.sub('\\\\u003cspan', '', final_text)
    final_text = re.sub('\\\\u003c/span', '', final_text)
    final_text = re.sub('\\\\u003ch2', '', final_text)
    final_text = re.sub('\\\\u003c/h2', '', final_text)
    final_text = re.sub('\\\\u2212', '', final_text)
    
    
    # ign funny promo tag removal
    ign_start_iter_funny = re.finditer('Start IGN Funny promo', final_text)
    ign_end_iter_funny = re.finditer('End IGN Funny promo', final_text)
    ign_funny_starts = []
    ign_funny_ends = []
    
    for iter1 in ign_start_iter_funny:
        ign_funny_starts.append(iter1.start())
    for iter2 in ign_end_iter_funny:
        ign_funny_ends.append(iter2.end())
    
    if len(ign_funny_starts)>0 and len(ign_funny_ends)>0 and len(ign_funny_starts)==len(ign_funny_ends):
        for i in range (0, len(ign_funny_starts) ):
            if i == 0:
                mid_text = final_text[:ign_funny_starts[i]] 
                if len(ign_funny_starts)==1:
                    mid_text += final_text[ign_funny_ends[i]:]
            
            elif i >= 1 and i < len(ign_funny_starts)-1:
                mid_text += final_text[ign_funny_ends[i-1]:ign_funny_starts[i]]
            
            elif i == len(ign_funny_starts)-1:
                mid_text += final_text[ign_funny_ends[i-1]:ign_funny_starts[i]] + final_text[ign_funny_ends[i]:]
                
    
        final_text = mid_text
    
    # image div tag removal
    start_iter = re.finditer('start image div', final_text)
    end_iter = re.finditer('end image div', final_text)
    image_starts = []
    image_ends = []
    
    for iter1 in start_iter:
        image_starts.append(iter1.start() )
    for iter2 in end_iter:
        image_ends.append(iter2.end() )

    if len(image_starts)>0 and len(image_ends)>0 and len(image_starts)==len(image_ends):
        for i in range (0, len(image_starts) ):
            if i == 0:
                mid_text = final_text[:image_starts[i]] 
                if len(image_starts)==1:
                    mid_text += final_text[image_ends[i]:]
            
            elif i >= 1 and i < len(image_starts)-1:
                mid_text += final_text[image_ends[i-1]:image_starts[i]]
            
            elif i == len(image_starts)-1:
                mid_text += final_text[image_ends[i-1]:image_starts[i]] + final_text[image_ends[i]:]
                
    
        final_text = mid_text
    
    # weblinks tag removal
    href_starts = []
    href_ends = []
    href_start_iter = re.finditer('href', final_text)
    href_end_iter = re.finditer('html', final_text)
    for iter1 in href_start_iter:
        href_starts.append(iter1.start() )
    for iter2 in href_end_iter:
        href_ends.append(iter2.end() )
    
    if len(href_starts)>0 and len(href_ends)>0 and len(href_starts)==len(href_ends):
        for i in range(0, len(href_starts) ):
            if i == 0:
                mid_text = final_text[:href_starts[i]]
                if len(href_starts)==1:
                    mid_text += final_text[href_ends[i]:]
                
            elif i >= 1 and i < len(href_starts)-1:
                mid_text += final_text[href_ends[i-1]:href_starts[i]]
            elif i == len(href_starts)-1:
                mid_text += final_text[href_ends[i-1]:href_starts[i]] + final_text[href_ends[i]:]
        final_text = mid_text
    
    # certain image option tag removal
    image_ref_starts = []
    image_ref_ends = []
    image_starts_iter = re.finditer('\\\\"\\\\u003cimg', final_text)
    image_ends_iter = re.finditer('.jpg', final_text) # may want other versions of this for different images
    
    for iter1 in image_starts_iter:
        image_ref_starts.append(iter1.start() )
    for iter2 in image_ends_iter:
        image_ref_ends.append(iter2.end() )
    
    if len(image_ref_starts)>0 and len(image_ref_ends)>0 and len(image_ref_starts)==len(image_ref_ends):
        for i in range(0, len(image_ref_starts) ):
            if i == 0:
                mid_text = final_text[:image_ref_starts[i]]
                if len(image_ref_starts)==1:
                    mid_text += final_text[image_ref_ends[i]:]
            elif i >= 1 and i < len(image_ref_starts)-1:
                mid_text += final_text[image_ref_ends[i-1]:image_ref_starts[i]]
            elif i == len(image_ref_starts)-1:
                mid_text += final_text[image_ref_ends[i-1]:image_ref_starts[i]] + final_text[image_ref_ends[i]:]
        final_text = mid_text
        
    def finishing_touches(text):
        
        # some other cleaning stuff after rounds have occurred

        # split the text into an array, remove words that are probably links
        text_array = text.split(' ')
        remove_array = []
        for word in text_array:
            if len(word) > 25:
                remove_array.append(word)
        for word in remove_array:
            text_array.remove(word)
        
        final_text = ''
        
        for word in text_array:
            final_text += word
            final_text += ' '
        
        final_text = re.sub('\\\\u0026', '', final_text)
        final_text = re.sub('\\\\u003c', '', final_text)
        
        return final_text
    
    # this way I can easily check it for errors
    final_text = finishing_touches(final_text)
    
    return final_text

In [9]:
def split_clean_tags(tags):
    
    temp_list = tags.split(',')
    
    tag_list = []
    for tag in temp_list:
        inbetween = re.sub(':', '', tag)
        inbetween = re.sub('\"', '', inbetween)
        inbetween = re.sub('\[', '', inbetween)
        inbetween = re.sub('\]', '', inbetween)
        tag_list.append(inbetween)
    
    return tag_list

In [10]:
def review_second_clean_run(text):
    # to create a second corpus that will get rid of differences between sentences
    
    final_text = text.lower()
    final_text = re.sub('[%s]' % re.escape(string.punctuation), '', final_text)
    
    return final_text

In [16]:
def create_new_frame(raw_frame):
    # the raw frame just has the links and the html in it
    # only care about game reviews at the moment
    
    frame_links = []
    frame_review_body = []
    frame_review_score = []
    frame_review_summary = []
    frame_review_score_text = []
    frame_tags = []
    frame_categories = []
    frame_modified_date = []
    frame_published_date = []
    frame_author_name = []
    frame_clean_body = []
    frame_title = []
    
    frame_tag_list = []
    frame_cat_list = []
    frame_corpus_body = []
    
    for row in raw_frame.itertuples():
        # need to remove duplicate rows
        link = row.link
        raw_html = row.some_unfiltered_html
        row_dict = split_script_to_parts(raw_html)
        
        # could add a condition to only examine games here
        # may want to limit it further
        if re.search('game', row_dict['tags']):
        
            frame_links.append(link)
            frame_review_body.append(row_dict['review_body'])
            
            first_clean_body_text = review_text_cleaner(row_dict['review_body'])
            corpus_body_text = review_second_clean_run(first_clean_body_text)
            
            frame_clean_body.append(first_clean_body_text )
            frame_corpus_body.append(corpus_body_text )
            
            frame_review_summary.append(row_dict['review_summary'])
            frame_review_score.append(row_dict['review_score'])
            frame_review_score_text.append(row_dict['review_score_text'])
            frame_tags.append(row_dict['tags'])
            frame_categories.append(row_dict['categories'])
            frame_modified_date.append(row_dict['modified_date'])
            frame_published_date.append(row_dict['published_date'])
            frame_author_name.append(row_dict['author_name'])
            frame_title.append(row_dict['review_title'])
            
            frame_tag_list.append(split_clean_tags(row_dict['tags']) )
            frame_cat_list.append(split_clean_tags(row_dict['categories']) )
            
            # could also clean tags and categories before placing them 
            # can clean review summaries
        
    new_frame_dict = {'links': frame_links, 'rough_review_body': frame_review_body, 
                      'cleaned_review_body': frame_clean_body, 
                      'review_summary_rough': frame_review_summary, 'review_score': frame_review_score, 
                      'tags': frame_tags, 'categories': frame_categories, 
                      'modified_date': frame_modified_date, 'published_date': frame_published_date, 
                      'author_name': frame_author_name, 'tag_list': frame_tag_list, 
                      'categories_list': frame_cat_list, 'review_title': frame_title, 
                      'review_body_corpus': frame_corpus_body}
    
    set_frame = pd.DataFrame.from_dict(new_frame_dict)
    
    return set_frame

In [17]:
# creating new dataframe
changed_df = create_new_frame(raw_notna_df)
# removing duplicate rows
changed_df = changed_df[~changed_df.duplicated(['cleaned_review_body'])]

In [18]:
print(changed_df.shape)
changed_df.head()

(13565, 14)


Unnamed: 0,links,rough_review_body,cleaned_review_body,review_summary_rough,review_score,tags,categories,modified_date,published_date,author_name,tag_list,categories_list,review_title,review_body_corpus
0,https://www.ign.com/articles/2011/02/16/playst...,"\u003csection class=\""article-page\""\u003eIt m...","It may be surprising to hear, but the Sharp S...",,8.5,":[""review"",""playstation-3"",""blogroll"",""games"",...",":[""ign"",""ps3"",""tech""],",2011-10-17T23:49:14+0000,2011-02-16T10:42:00+0000,"""Scott Lowe""","[review, playstation-3, blogroll, games, gear,...","[ign, ps3, tech, ]","""PlayStation Move Sharp Shooter Review""",it may be surprising to hear but the sharp sh...
1,https://www.ign.com/articles/2011/07/19/warham...,"\u003csection class=\""article-page\""\u003e\u00...",'Warhammer 40K: Kill Team might be unfortun...,"""And \""also-ran\"" hangs all over Warhammer 40K...",6.5,":[""review"",""blogroll"",""event-essentials"",""hot""...",":[""ign"",""ps3"",""xbox-360"",""xbox-live""],",2011-10-18T05:59:32+0000,2011-07-19T19:47:00+0000,"""Arthur Gies""","[review, blogroll, event-essentials, hot, lega...","[ign, ps3, xbox-360, xbox-live, ]","""Warhammer 40K: Kill Team Review""",warhammer 40k kill team might be unfortunat...
2,https://www.ign.com/articles/2011/07/20/call-o...,"\u003csection class=\""article-page\""\u003eThe ...",The Call of Juarez franchise always intereste...,"""Call of Juarez: The Cartel is a poor change o...",4.5,":[""review"",""blogroll"",""call-of-juarez-the-cart...",":[""ign"",""pc"",""ps3"",""xbox-360""],",2011-10-18T06:02:02+0000,2011-07-20T00:14:00+0000,"""Anthony Gallegos""","[review, blogroll, call-of-juarez-the-cartel, ...","[ign, pc, ps3, xbox-360, ]","""Call of Juarez: The Cartel Review""",the call of juarez franchise always intereste...
3,https://www.ign.com/articles/2011/07/20/captai...,"\u003csection class=\""article-page\""\u003eThe ...",The majority of movie games are just awful. S...,"""Captain America: Super Soldier is a mediocre ...",5.5,":[""review"",""blogroll"",""legacy"",""wii"",""games"",""...",":[""ign"",""wii""],",2011-10-18T06:02:20+0000,2011-07-20T00:43:00+0000,"""Audrey Drake""","[review, blogroll, legacy, wii, games, captain...","[ign, wii, ]","""Captain America: Super Soldier Wii Review""",the majority of movie games are just awful st...
4,https://www.ign.com/articles/2011/07/20/quiz-c...,"\u003csection class=\""article-page\""\u003e\u00...","'Quiz Climber, from Buzz! developer 'Rele...","""Ultimately, Quiz Climber is just a little too...",6.5,":[""review"",""blogroll"",""games"",""legacy"",""wirele...",":[""ign"",""wireless""],",2011-10-18T06:07:06+0000,2011-07-20T21:44:00+0000,"""Justin Davis""","[review, blogroll, games, legacy, wireless, qu...","[ign, wireless, ]","""Quiz Climber Review""",quiz climber from buzz developer relentle...


In [20]:
print(changed_df.shape)
loc=13000
# print(changed_df['cleaned_review_body'].loc[1])
print(changed_df['links'].loc[loc], '\n')
# names have various writing quirks to them, need to identify what went wrong
# print(pd.unique(changed_df['author_name']))
print(changed_df['review_title'].loc[loc], '\n')
print('score = ', changed_df['review_score'].loc[loc],'\n')
print(changed_df['tag_list'].loc[loc],'\n')
print(changed_df['categories_list'].loc[loc],'\n')
print('author = ', changed_df['author_name'].loc[loc], '\n')
print(changed_df['cleaned_review_body'].loc[loc], '\n')
print(changed_df['review_summary_rough'].loc[loc])

(13565, 14)
https://www.ign.com/articles/2018/10/11/luigis-mansion-for-3ds-review 

"Luigi's Mansion for 3DS Review" 

score =  7 

['review', 'game', 'luigis-mansion', '3ds', 'nintendo-ead', 'nintendo', 'adventure', 'mario', ''] 

['ign', 'nintendo', ''] 

author =  "Jared Petty" 

 Genre distinctions are supposed to exist for our convenience, but Luigi's Mansion defies them all. It exists somewhere between a kid-friendly take on survival horror, a competent puzzle adventure, and a terrific unlicensed Ghostbusters game. The new 3DS version of the Gamecube launch title adds a few new features to the original, but Luigi's Mansion on 3DS is a faithful port that delivers fun moments while showing its age in some key areas. \n style\"color: #99cc00\"The Fear of LuigiLuigi's Mansion's gameplay is entertaining but simple: Enter a new area, use detection gear and wits to solve a mystery, fight a mini-boss ghost, loot the room, and move on. I enjoyed many of the distinctive encounters that inv

### Decided to check a couple of things below
and save the processing to a file

In [13]:
# write changed_df to file
changed_df.to_csv('ign_data/reviews_first_processed.csv', sep='`')

In [14]:
# all of the tech tags may indicate reviews that aren't games
# to verify something is a game, may want to include a list of 
# 
null_df = changed_df.loc[changed_df['review_summary_rough']=='null']
print(null_df.shape)
null_df.head()

(2527, 14)


Unnamed: 0,links,rough_review_body,cleaned_review_body,review_summary_rough,review_score,tags,categories,modified_date,published_date,author_name,tag_list,categories_list,review_title,review_body_corpus
0,https://www.ign.com/articles/2011/02/16/playst...,"\u003csection class=\""article-page\""\u003eIt m...","It may be surprising to hear, but the Sharp S...",,8.5,":[""review"",""playstation-3"",""blogroll"",""games"",...",":[""ign"",""ps3"",""tech""],",2011-10-17T23:49:14+0000,2011-02-16T10:42:00+0000,"""Scott Lowe""","[review, playstation-3, blogroll, games, gear,...","[ign, ps3, tech, ]","""PlayStation Move Sharp Shooter Review""",it may be surprising to hear but the sharp sh...
34,https://www.ign.com/articles/2011/04/19/mortal...,"\u003csection class=\""article-page\""\u003eThis...","This year, \""Mortal Kombat returns to its v...",,9.0,":[""review"",""xbox-360"",""blogroll"",""games"",""gear...",":[""ign"",""ps3"",""tech"",""xbox-360""],",2011-10-18T19:24:33+0000,2011-04-19T14:01:00+0000,"""Scott Lowe""","[review, xbox-360, blogroll, games, gear, lega...","[ign, ps3, tech, xbox-360, ]","""Mortal Kombat Fight Stick Review""",this year mortal kombat returns to its viol...
35,https://www.ign.com/articles/2011/03/23/crysis...,"\u003csection class=\""article-page\""\u003eAt l...","At long last, \""Crysis 2 has arrived, and ...",,6.0,":[""review"",""playstation-3"",""blogroll"",""crysis-...",":[""ps3"",""tech""],",2011-10-18T19:25:50+0000,2011-03-23T08:11:00+0000,"""Scott Lowe""","[review, playstation-3, blogroll, crysis-2, cr...","[ps3, tech, ]","""Crysis 2 Bluetooth Headset Review""",at long last crysis 2 has arrived and we l...
36,https://www.ign.com/articles/2011/03/11/thrust...,"\u003csection class=\""article-page\""\u003eTher...","There is a mathematical fact about the \""Th...",,8.0,":[""review"",""playstation-3"",""blogroll"",""games"",...",":[""ign"",""ps3"",""tech""],",2011-10-18T19:27:41+0000,2011-03-11T10:10:00+0000,"""Ryan Geddes""","[review, playstation-3, blogroll, games, gear,...","[ign, ps3, tech, ]","""Thrustmaster T500 RS Review""",there is a mathematical fact about the thru...
37,https://www.ign.com/articles/2000/09/18/micros...,"\u003csection class=\""article-page\""\u003eNow ...",Now that it's been confirmed that the good ol...,,,":[""review"",""accessories"",""gear"",""legacy"",""micr...",":[""tech""],",2011-10-21T23:21:18+0000,2000-09-18T07:00:00+0000,"""Vinny Lopez""","[review, accessories, gear, legacy, microsoft,...","[tech, ]","""Microsoft Sidewinder Game Pad Pro Review""",now that its been confirmed that the good ol ...


In [15]:
# reviews by ign staff
ign_staff_df = changed_df.loc[changed_df['author_name']=='"IGN Staff"']
print(ign_staff_df.shape)
ign_staff_df.head()

(803, 14)


Unnamed: 0,links,rough_review_body,cleaned_review_body,review_summary_rough,review_score,tags,categories,modified_date,published_date,author_name,tag_list,categories_list,review_title,review_body_corpus
19,https://www.ign.com/articles/2011/07/26/stickm...,"\u003csection class=\""article-page\""\u003e\u00...",'Stickman BMX from 'Traction Games is pla...,"""Stickman BMX has a lot more content than most...",6.0,":[""review"",""blogroll"",""games"",""legacy"",""wirele...",":[""ign"",""wireless""],",2011-10-18T06:30:20+0000,2011-07-26T17:48:00+0000,"""IGN Staff""","[review, blogroll, games, legacy, wireless, st...","[ign, wireless, ]","""Stickman BMX (iOS) Review""",stickman bmx from traction games is platf...
229,https://www.ign.com/articles/2011/11/15/big-re...,"\u003csection class=\""article-page\""\u003eToda...",Today is the biggest day of the year for game...,,,":[""feature"",""blogroll"",""games"",""legacy"",""video...",":[""games"",""ign"",""pc"",""ps3"",""wii"",""xbox-360""],",2012-01-17T20:11:45+0000,2011-11-15T18:58:00+0000,"""IGN Staff""","[feature, blogroll, games, legacy, videogame, ]","[games, ign, pc, ps3, wii, xbox-360, ]","""Big Reviews Round-Up""",today is the biggest day of the year for game...
1560,https://www.ign.com/articles/2011/04/19/world-...,"\u003csection class=\""article-page\""\u003eIGN ...","IGN gave 'World of Goo a \""near-perfect sc...","""World of Goo is a perfect fit for the iPhone....",9.5,":[""review"",""blogroll"",""event-essentials"",""game...",":[""ign"",""wireless""],",2012-05-08T06:22:18+0000,2011-04-19T18:20:00+0000,"""IGN Staff""","[review, blogroll, event-essentials, games, ho...","[ign, wireless, ]","""World of Goo iPhone Review""",ign gave world of goo a nearperfect score ...
1592,https://www.ign.com/articles/2011/04/12/the-si...,"\u003csection class=\""article-page\""\u003eDon'...",Don't be fooled by the box art \u0026#x2013; ...,"""It\u0026#8217;s a pleasant surprise that The ...",7.5,":[""review"",""legacy"",""nintendo-ds"",""games"",""the...",":[""ds""],",2012-05-08T07:00:31+0000,2011-04-12T22:07:00+0000,"""IGN Staff""","[review, legacy, nintendo-ds, games, the-sims-...","[ds, ]","""The Sims 3 3DS Review""",dont be fooled by the box art u0026x2013 th...
1794,https://www.ign.com/articles/2010/07/15/the-pl...,"\u003csection class=\""article-page\""\u003eOne ...",One of the best things about gaming on mobile...,"""Yes, I'm gushing over the Plateau, but I cann...",8.0,":[""review"",""games"",""legacy"",""wireless"",""the-pl...",":[""wireless""],",2012-05-08T10:24:38+0000,2010-07-15T21:20:00+0000,"""IGN Staff""","[review, games, legacy, wireless, the-plateau,...","[wireless, ]","""The Plateau Android Review""",one of the best things about gaming on mobile...


Should also 

In [16]:
# create final df that I wish to pass to the 
ign_df = changed_df.loc[changed_df['review_summary_rough']!='null']
ign_df.to_csv('ign_data/ign_pass_one.csv', sep='`')

In [17]:
# and pickle it
ign_df.to_pickle('ign_data/ign_pass_one.pkl')

Want to create a document term matrix as well

- should handle several of the terms in the dtm

In [18]:
cv = CountVectorizer(stop_words='english') # consider adding bigrams in a dtm later
#cv_bigram = CountVectorizer(stop_words='english', ngram_range=(1,2))
ign_body_cv = cv.fit_transform(ign_df.cleaned_review_body)
ign_body_dtm = pd.DataFrame(ign_body_cv.toarray(), columns=cv.get_feature_names() )
ign_body_dtm.index = ign_df.index

ign_body_dtm.head()

Unnamed: 0,00,000,000000,00001,00005,00028,00039,00044,00050,000dpi,...,zx,zxs,zynga,zz,zzz,zzzz,état,été,ôkami,über
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [19]:
# pickle dtm
ign_body_dtm.to_pickle('ign_data/ign_pass_one_dtm.pkl')


In [20]:
# pickle count vectorizer object also
pickle.dump(cv, open("ign_data/cv_pass_one.pkl", "wb"))

In [22]:
# dtm for corpus
# same variables named to keep memory down
# probably hold off on this one until I get further into the analysis and need more refinement

cv = CountVectorizer(stop_words='english')
ign_body_cv = cv.fit_transform(ign_df.review_body_corpus)
ign_body_dtm = pd.DataFrame(ign_body_cv.toarray(), columns=cv.get_feature_names() )
ign_body_dtm.index = ign_df.index

ign_body_dtm.to_csv('ign_data/ign_corpus_pass_one_dtm.csv') # couldn't use pickle, file too large
pickle.dump(cv, open('ign_data/corpus_cv_pass_one.pkl', 'wb'))