In [1]:
# Import libraries
import numpy as np
import pandas as pd
import requests
from tqdm import tqdm

### Read in table with titles to be scraped

In [2]:
no_dupes = pd.read_csv('../data/imdb_titles_no_dupes.csv')

In [3]:
no_dupes.shape

(2484, 9)

In [4]:
no_dupes.head(10)

Unnamed: 0,name,href,years,imdb_description,pg_rating,imdb_genre_tags,imdb_rating,num_votes,img_thumbnail
0,Game of Thrones,/title/tt0944947/,(2011–2019),Nine noble families fight for control over the...,TV-MA,"Action, Adventure, Drama",9.2,2148311,https://m.media-amazon.com/images/M/MV5BYTRiND...
1,Prison Break,/title/tt0455275/,(2005–2017),"Due to a political conspiracy, an innocent man...",TV-14,"Action, Crime, Drama",8.3,548267,https://m.media-amazon.com/images/M/MV5BMTg3NT...
2,Vikings,/title/tt2306299/,(2013–2020),Vikings transports us to the brutal and myster...,TV-MA,"Action, Adventure, Drama",8.5,547494,https://m.media-amazon.com/images/M/MV5BODk4Zj...
3,The Boys,/title/tt1190634/,(2019– ),A group of vigilantes set out to take down cor...,TV-MA,"Action, Comedy, Crime",8.7,542317,https://m.media-amazon.com/images/M/MV5BOTEyND...
4,The Mandalorian,/title/tt8111088/,(2019– ),The travels of a lone bounty hunter in the out...,TV-14,"Action, Adventure, Fantasy",8.7,527088,https://m.media-amazon.com/images/M/MV5BZjRlZD...
5,The Witcher,/title/tt5180504/,(2019– ),"Geralt of Rivia, a solitary monster hunter, st...",TV-MA,"Action, Adventure, Drama",8.1,513940,https://m.media-amazon.com/images/M/MV5BN2FiOW...
6,Money Heist,/title/tt6468322/,(2017–2021),An unusual group of robbers attempt to carry o...,TV-MA,"Action, Crime, Drama",8.2,491872,https://m.media-amazon.com/images/M/MV5BODI0ZT...
7,Squid Game,/title/tt10919420/,(2021– ),Hundreds of cash-strapped players accept a str...,TV-MA,"Action, Drama, Mystery",8.0,469810,https://m.media-amazon.com/images/M/MV5BYWE3MD...
8,Daredevil,/title/tt3322312/,(2015–2018),"A blind lawyer by day, vigilante by night. Mat...",TV-MA,"Action, Crime, Drama",8.6,450320,https://m.media-amazon.com/images/M/MV5BODcwOT...
9,Arrow,/title/tt2193021/,(2012–2020),Spoiled billionaire playboy Oliver Queen is mi...,TV-14,"Action, Adventure, Crime",7.5,436286,https://m.media-amazon.com/images/M/MV5BMTI0NT...


### Define functions for our API call

In [5]:
def wiki_text_query(inp_subject, start_year=""):
    '''Resolve a series name to a Wikipedia search term, then fetch that page's text.

    A wrapper around wiki_text_query_internal(): wiki_search_term() is called first
    to get the "correct" search term for `inp_subject` (optionally qualified by
    `start_year`), and that term is then used to request the article text.

    Parameters:
        inp_subject: series name to look up.
        start_year: start year passed through to wiki_search_term().

    Returns:
        (warning_flag, search_string, search_url, output) when the text request
        succeeds; `output` is the article text. If the text request raises,
        an "ERROR 2" line is printed and (warning_flag, "", "", "") is returned.
    '''

    warning_flag, search_string, search_url = wiki_search_term(inp_subject, start_year)

    try:
        output = wiki_text_query_internal(search_string)
        if len(output) == 0:
            print(f"Warning: 0 length output for: {search_string}")

        return warning_flag, search_string, search_url, output

    except Exception:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt and
        # SystemExit still stop a long scraping loop instead of being reported
        # as a broken search.
        print(f"ERROR 2: broken search for: {search_string}")
        return warning_flag, "", "", ""
    

In [6]:
def wiki_search_term(inp_subject, start_year=""):
    '''Return the "correct" Wikipedia search term for a series via the OpenSearch API.

    Tries progressively less specific search strings and stops at the first one
    that yields at least one suggestion:
        _____({year} TV series), then _____(TV series), then _____

    Parameters:
        inp_subject: series name to search for.
        start_year: year inserted into the first, most specific search string.

    Returns:
        (warning_flag, title, url). On a hit, `title` and `url` are the first
        suggestion and its link. If no attempt yields a suggestion, `title` is the
        first element of the last response and `url` is "".

    Warning Flag codes:
        0 - page found under _____({year } TV_series)
        1 - page found under _____(TV_series)
        2 - page found under _____
        3 - no page found for search term
        4 - page found using manual override search term (never returned here)
    '''

    URL = "https://en.wikipedia.org/w/api.php"

    PARAMS = {
        "action": "opensearch",
        "namespace": "0",
        "limit": "5",
        "format": "json"
    }

    # Most specific first: the position of a candidate in this list is its warning flag.
    candidates = [
        inp_subject + f" ({start_year} TV series)",
        inp_subject + " (TV series)",
        inp_subject,
    ]

    # The context manager closes the session (and its pooled connections) on every
    # exit path; this function is called once per title, so leaks would accumulate.
    with requests.Session() as S:
        for warningflag, candidate in enumerate(candidates):
            R = S.get(url=URL, params=PARAMS | {'search': candidate})
            DATA = R.json()

            # DATA[1] holds the suggested titles, DATA[3] the matching URLs.
            if len(DATA[1]) > 0:
                return warningflag, DATA[1][0], DATA[3][0]

    # No candidate produced a suggestion.
    return 3, DATA[0], ""

# Reference: https://www.mediawiki.org/wiki/API:Opensearch

In [7]:
def wiki_text_query_internal(subject):
    '''Fetch the plain-text extract of the Wikipedia page titled `subject`.

    Sends a single `action=query` / `prop=extracts` request (with redirects
    enabled) and returns the 'extract' field of the first page in the response.
    Raises KeyError when that page carries no 'extract' entry.
    '''

    api_endpoint = 'https://en.wikipedia.org/w/api.php'

    # Adding exintro=True here would return just the first blurb on the page.
    query_args = dict(
        action='query',
        format='json',
        titles=subject,
        prop='extracts',
        explaintext=True,
        redirects=True,
    )

    payload = requests.get(api_endpoint, params=query_args).json()

    # The response keys pages by id; only the first entry is used.
    pages = payload['query']['pages']
    first_page = next(iter(pages.values()))

    return first_page['extract']

    
# Reference: https://www.jcchouinard.com/wikipedia-api/
# Reference: https://www.mediawiki.org/wiki/API:Parsing_wikitext
# Reference: https://www.mediawiki.org/wiki/API:Get_the_contents_of_a_page

In [None]:
# NOTE(review): this cell has no execution count and nothing else in the visible
# notebook reads `s` - it looks like a leftover scratch example of requests.Session
# usage rather than part of the scraping pipeline. Running it sends a request to
# httpbin.org; consider deleting the cell. TODO confirm it is unused.
s = requests.Session()
s.auth = ('user', 'pass')
s.headers.update({'x-test': 'true'})

# both 'x-test' and 'x-test2' are sent
s.get('http://httpbin.org/headers', headers={'x-test2': 'true'})

In [8]:
def override_query(inp_subject, start_year=""):
    '''Fetch article text for a manually chosen page title, without trying alternate terms.

    Parameters:
        inp_subject: page title to request directly.
        start_year: not used by this function; the signature parallels
            wiki_text_query().

    Returns:
        (4, inp_subject, search_url, output) when the text request succeeds, where
        `search_url` is built from `inp_subject` with spaces replaced by
        underscores. If the text request raises, an "ERROR 2" line is printed and
        (4, "", "", "") is returned.
    '''

    warning_flag = 4  # flag value used for manual override search terms
    search_string = inp_subject

    search_url = f"https://en.wikipedia.org/wiki/{inp_subject.replace(' ', '_')}"

    try:
        output = wiki_text_query_internal(search_string)
        if len(output) == 0:
            print(f"Warning: 0 length output for: {search_string}")

        return warning_flag, search_string, search_url, output

    except Exception:
        # `Exception` rather than a bare `except:` so that KeyboardInterrupt and
        # SystemExit are not silently reported as a broken search.
        print(f"ERROR 2: broken search for: {search_string}")
        return warning_flag, "", "", ""

# Reference: https://www.jcchouinard.com/wikipedia-api/
# Reference: https://www.mediawiki.org/wiki/API:Parsing_wikitext
# Reference: https://www.mediawiki.org/wiki/API:Get_the_contents_of_a_page

### API query for the article text of each entry in our list

In [9]:
# Create a dict mapping each row label to the start year of its series, to help handle
# edge cases where the year is contained in the URL.
# The year is the first 4-digit number beginning with 1 or 2 found in the 'years' string,
# so any leading non-year text is skipped; rows with no such number (or a missing value)
# map to ''.

start_years = (
    no_dupes['years']
    .str.extract(r'([12]\d{3})', expand=False)
    .fillna('')
    .to_dict()
)

In [10]:
# One Wikipedia lookup per title; results are keyed by row number
wiki_articles = {}

for i in tqdm(range(len(no_dupes))):

    name = no_dupes.loc[i, 'name']

    warning_flag, search_term, search_url, wiki_text = wiki_text_query(name, start_years[i])

    wiki_articles[i] = dict(
        index_no=i,
        name=name,
        wiki_warning_flag=warning_flag,
        wiki_search_term=search_term,
        wiki_search_url=search_url,
        wiki_text=wiki_text,
    )

 15%|█▌        | 375/2484 [03:31<19:17,  1.82it/s]

ERROR 2: broken search for: Tokyo Ghoul: Root A


 15%|█▌        | 380/2484 [03:34<21:13,  1.65it/s]

ERROR 2: broken search for: Rurouni Kenshin: Trust and Betrayal


 30%|██▉       | 744/2484 [07:12<17:44,  1.64it/s]

ERROR 2: broken search for: The Wheel of Time: Origins


 34%|███▍      | 848/2484 [08:16<15:18,  1.78it/s]

ERROR 2: broken search for: Owari no serafu


 36%|███▋      | 903/2484 [08:48<15:55,  1.65it/s]

ERROR 2: broken search for: Scam 1992: The Harshad Mehta Story


 38%|███▊      | 946/2484 [09:12<14:48,  1.73it/s]

ERROR 2: broken search for: Self Made: Inspired by the Life of Madam C.J. Walker


 48%|████▊     | 1183/2484 [11:31<13:02,  1.66it/s]

ERROR 2: broken search for: The Filthy Frank Show


 66%|██████▌   | 1641/2484 [15:49<09:36,  1.46it/s]

ERROR 2: broken search for: Inside Look: The Assassination of Gianni Versace - American Crime Story


 66%|██████▌   | 1644/2484 [15:51<08:51,  1.58it/s]

ERROR 2: broken search for: Jeen-yuhs: A Kanye Trilogy


 69%|██████▊   | 1704/2484 [16:27<07:30,  1.73it/s]

ERROR 2: broken search for: FIFA Uncovered


 69%|██████▉   | 1711/2484 [16:31<07:38,  1.69it/s]

ERROR 2: broken search for: I Just Killed My Dad


 69%|██████▉   | 1719/2484 [16:36<07:24,  1.72it/s]

ERROR 2: broken search for: Sins of Our Mother


 83%|████████▎ | 2061/2484 [19:48<04:05,  1.72it/s]

ERROR 2: broken search for: A Young Doctor's Notebook & Other Stories


 83%|████████▎ | 2074/2484 [19:55<03:48,  1.79it/s]

ERROR 2: broken search for: Pera Palas'ta Gece Yarisi


 84%|████████▎ | 2078/2484 [19:57<03:41,  1.83it/s]

ERROR 2: broken search for: The Heavy Water War: Stopping Hitler's Atomic Bomb


 99%|█████████▉| 2460/2484 [23:29<00:13,  1.77it/s]

ERROR 2: broken search for: Doctor Foster: A Woman Scorned


100%|██████████| 2484/2484 [23:42<00:00,  1.75it/s]


In [39]:
wiki_articles[0]

{'index_no': 0,
 'name': 'Game of Thrones',
 'wiki_search_term': 'Game of thrones (TV series)',
 'wiki_search_url': 'https://en.wikipedia.org/wiki/Game_of_thrones_(TV_series)',
 'wiki_text': 'Game of Thrones is an American fantasy drama television series created by David Benioff and D. B. Weiss for HBO. It is an adaptation of A Song of Ice and Fire, a series of fantasy novels by George R. R. Martin, the first of which is A Game of Thrones. The show was shot in the United Kingdom, Canada, Croatia, Iceland, Malta, Morocco, and Spain. It premiered on HBO in the United States on April 17, 2011, and concluded on May 19, 2019, with 73 episodes broadcast over eight seasons.\nSet on the fictional continents of Westeros and Essos, Game of Thrones has a large ensemble cast and follows several story arcs throughout the course of the show. The first major arc concerns the Iron Throne of the Seven Kingdoms of Westeros through a web of political conflicts among the noble families either vying to cla

### Create a DataFrame from the extracted data

In [12]:
wiki_articles_df = pd.DataFrame.from_dict(wiki_articles, orient='index')

In [40]:
wiki_articles_df.head()

Unnamed: 0,index_no,name,wiki_warning_flag,wiki_search_term,wiki_search_url,wiki_text
0,0,Game of Thrones,1,Game of thrones (TV series),https://en.wikipedia.org/wiki/Game_of_thrones_...,Game of Thrones is an American fantasy drama t...
1,1,Prison Break,1,Prison break (TV series),https://en.wikipedia.org/wiki/Prison_break_(TV...,Prison Break is an American serial drama telev...
2,2,Vikings,0,Vikings (2013 TV series),https://en.wikipedia.org/wiki/Vikings_(2013_TV...,Vikings is a historical drama television serie...
3,3,The Boys,0,The Boys (2019 TV series),https://en.wikipedia.org/wiki/The_Boys_(2019_T...,The Boys is an American superhero television s...
4,4,The Mandalorian,1,The Mandalorian (TV series),https://en.wikipedia.org/wiki/The_Mandalorian_...,The Mandalorian is an American space Western t...


In [14]:
#double-checking no entry count mismatch
len(wiki_articles_df)

2484

### Identifying entries that did not run as intended

In [15]:
wiki_articles_df['wiki_warning_flag'].value_counts()

1    1116
2    1010
0     342
3      16

##### Blank entries:

In [16]:
wiki_articles_df[wiki_articles_df['wiki_text'] == '']

Unnamed: 0,index_no,name,wiki_warning_flag,wiki_search_term,wiki_search_url,wiki_text
374,374,Tokyo Ghoul: Root A,3,,,
379,379,Rurouni Kenshin: Trust and Betrayal,3,,,
743,743,The Wheel of Time: Origins,3,,,
847,847,Owari no serafu,3,,,
902,902,Scam 1992: The Harshad Mehta Story,3,,,
945,945,Self Made: Inspired by the Life of Madam C.J. ...,3,,,
1182,1182,The Filthy Frank Show,3,,,
1640,1640,Inside Look: The Assassination of Gianni Versa...,3,,,
1643,1643,Jeen-yuhs: A Kanye Trilogy,3,,,
1703,1703,FIFA Uncovered,3,,,


Empty Entries:
> Tokyo Ghoul: Root A ;	
Rurouni Kenshin: Trust and Betrayal ;	
The Wheel of Time: Origins ;	
Owari no serafu	;
Scam 1992: The Harshad Mehta Story ;	
Self Made: Inspired by the Life of Madam C.J. Walker ;	
The Filthy Frank Show ;	
Inside Look: The Assassination of Gianni Versace - American Crime Story ;	
Jeen-yuhs: A Kanye Trilogy ;	
FIFA Uncovered ;	
I Just Killed My Dad ;	
Sins of Our Mother ;	
A Young Doctor's Notebook & Other Stories ;	
Pera Palas'ta Gece Yarisi ;	
The Heavy Water War: Stopping Hitler's Atomic Bomb ;	
Doctor Foster: A Woman Scorned ;

In [17]:
# Create a dictionary mapping from old search term to new (override) search term.
# Keys are the series names whose lookup came back empty; values are the page titles
# handed to override_query() in the next cell. Names with no applicable Wikipedia
# entry are left commented out, so they stay blank.

search_term_transform = {
    "Tokyo Ghoul: Root A" : "Tokyo Ghoul",
    "Rurouni Kenshin: Trust and Betrayal" : "Rurouni Kenshin: Trust & Betrayal",
    "The Wheel of Time: Origins" : "The Wheel of Time",
    "Owari no serafu" : "Seraph of the End",
    "Scam 1992: The Harshad Mehta Story" : "Scam 1992",
    "Self Made: Inspired by the Life of Madam C.J. Walker" : "Self Made (miniseries)",
    "The Filthy Frank Show" : "Joji (musician)",
    "Inside Look: The Assassination of Gianni Versace - American Crime Story" : "The Assassination of Gianni Versace: American Crime Story",
    "Jeen-yuhs: A Kanye Trilogy" : "Jeen-Yuhs",
    # "FIFA Uncovered" : "",        # No applicable Wikipedia entry found
    # "I Just Killed My Dad" : "",  # No applicable Wikipedia entry found
    "Sins of Our Mother" : "Killings of Tylee Ryan and J. J. Vallow",
    "A Young Doctor's Notebook & Other Stories" : "A Young Doctor's Notebook (TV series)",
    "Pera Palas'ta Gece Yarisi" : "Midnight at the Pera Palace",
    "The Heavy Water War: Stopping Hitler's Atomic Bomb" : "The Heavy Water War",
    "Doctor Foster: A Woman Scorned" : "Doctor Foster (TV series)"
}

In [18]:
# Fixing the entries that came up empty

for name, override_term in search_term_transform.items():

    # The name must identify exactly one row; fail with a readable message otherwise
    # (calling int() on a Series is deprecated and gives an opaque TypeError).
    matches = wiki_articles_df.loc[wiki_articles_df['name'] == name, 'index_no']
    if len(matches) != 1:
        raise ValueError(f"expected exactly one row named {name!r}, found {len(matches)}")
    index_no = int(matches.iloc[0])

    warning_flag, search_term, search_url, wiki_text = override_query(override_term)

    # Overwrite only the Wikipedia-derived fields; 'index_no' and 'name' are kept.
    wiki_articles[index_no].update({
        'wiki_warning_flag': warning_flag,
        'wiki_search_term': search_term,
        'wiki_search_url': search_url,
        'wiki_text': wiki_text,
    })


In [19]:
# sanity check: rebuild the frame and count entries whose text is still blank (2 expected)
wiki_articles_df = pd.DataFrame.from_dict(wiki_articles, orient='index')
int(wiki_articles_df['wiki_text'].eq('').sum())

2

In [20]:
wiki_articles_df[wiki_articles_df['wiki_text'] == '']

Unnamed: 0,index_no,name,wiki_warning_flag,wiki_search_term,wiki_search_url,wiki_text
1703,1703,FIFA Uncovered,3,,,
1710,1710,I Just Killed My Dad,3,,,


##### Entries containing the phrase "may refer to" indicate that the wrong page (a disambiguation page) was scraped

In [21]:
may_refer_to_df = wiki_articles_df[wiki_articles_df['wiki_text'].str.contains("may refer to")]

In [22]:
may_refer_to_df.shape

(66, 6)

In [23]:
may_refer_to_df.index

Int64Index([  87,  360,  397,  460,  472,  663,  679,  708,  756,  783,  792,
             821,  836,  843,  860,  941,  944, 1065, 1088, 1136, 1168, 1208,
            1218, 1246, 1292, 1359, 1374, 1397, 1398, 1467, 1493, 1506, 1508,
            1518, 1569, 1570, 1601, 1727, 1740, 1798, 1814, 1821, 1833, 1853,
            1870, 1904, 1931, 1953, 1956, 1960, 1981, 2038, 2042, 2102, 2193,
            2205, 2209, 2251, 2255, 2306, 2310, 2311, 2323, 2335, 2447, 2463],
           dtype='int64')

In [24]:
may_refer_to_df[['name']]

Unnamed: 0,name
87,The Family Man
360,The River
397,In the Dark
460,Law of the Lawless
472,Tomorrow
...,...
2311,Fingersmith
2323,Still Standing
2335,Run
2447,The View


In [25]:
# Create a dictionary mapping from old search term to new (override) search term.
# Keys are series names as they appear in the 'name' column; values are the page titles
# handed to override_query() in the next cell.
# Commented-out entries are not looked up by name; they are handled separately by row
# index (see bad_indices further down).

search_term_transform2 = {
    'The Family Man' : 'The Family Man (Indian TV series)', 
    'The River': 'The River (U.S. TV series)', 
    'In the Dark': 'In the Dark (American TV series)', 
    'Law of the Lawless': 'Brigada', 
    'Tomorrow': 'Tomorrow (South Korean TV series)', 
    'Wrecked': 'Wrecked (American TV series)', 
    'Into the West': 'Into the West (miniseries)',
    'Big Mouth': 'Big Mouth (American TV series)', 
    'Food Wars': 'Food Wars!: Shokugeki no Soma', 
    'Gantz': 'Gantz', 
    'Fighting Spirit': 'Hajime no Ippo', 
    'The Liberator': 'The Liberator (miniseries)',
    'Golden Boy': 'Golden Boy (manga)', 
    'When They Cry': 'Higurashi When They Cry', 
    '86': '86 (novel_series)', 
    'Carlos': 'Carlos (miniseries)', 
    'Aquarius': 'Aquarius (American TV series)', 
    'Louie': 'Louie (American TV series)', 
    'Peep Show': 'Peep Show (British TV series)', 
    'Wilfred': 'Wilfred (American TV series)', 
    'Last Man Standing': 'Last Man Standing (American TV series)', 
    'Rescue Me': 'Rescue Me (American TV series)',
    # 'Being Human': 'Being Human (North American TV series)',  #gives error- handle separately
    'Crashing': 'Crashing (British TV series)', 
    'The Kingdom': 'The Kingdom (miniseries)', 
    'The Killing': 'The Killing (U.S. TV series)', 
    'The Outsider': 'The Outsider (miniseries)',
    'The Stranger': 'The Stranger (British TV series)', 
    'Leverage': 'Leverage (American TV series)', 
    'Big Sky': 'Big Sky (American TV series)', 
    'True Story': 'True Story (miniseries)', 
    # 'Shameless': 'Shameless (British TV series)',  #gives error- handle separately
    'The Finder': 'The Finder (American TV series)',
    'Wallander': 'Wallander (British TV series)',
    'Black Sun': 'Balkan Shadows',
    'Bordertown': 'Bordertown (Finnish TV series)',
    'Ashes to Ashes': 'Ashes to Ashes (British TV series)',
    'Band of Brothers': 'Band of Brothers (miniseries)',
    'Heroes': 'Heroes (American TV series)',
    'Dopesick': 'Dopesick (miniseries)',
    'The Magicians': 'The Magicians (American TV series)',
    'Little Fires Everywhere': 'Little Fires Everywhere (miniseries)',
    'Messiah': 'Messiah (American TV series)',
    'Heartland': 'Heartland (Canadian TV series)',
    'Shake It Up': 'Shake It Up (American TV series)',
    'Wipeout': 'Wipeout (2008 game show)',
    # 'Rebelde': 'Rebelde',  #gives error- handle separately
    'The Gift': 'The Gift (Turkish TV series)',
    # 'The Returned': '',  # index 1956 /title/tt2521668/	The Returned (French TV series)
    'The Shining': 'The Shining (TV miniseries)',
    # 'The Returned': '', # index 1981 /title/tt3230780/	The Returned (American TV series)
    'The Circle': 'The Circle (American TV series)',
    'Face Off': 'Face Off (American TV series)',
    'Rose Red': 'Rose Red (miniseries)',
    # 'Taken': 'Taken (miniseries)',  #gives error- handle separately
    'Echoes': 'Echoes (miniseries)',
    'Flower of Evil': 'Flower of Evil (South Korean TV series)',
    'North & South': 'North & South (TV serial)',
    'The L Word': 'The L Word',
    'The Thorn Birds': 'The Thorn Birds (miniseries)',
    'True Beauty': 'True Beauty (South Korean TV series)',
    'Fingersmith': 'Fingersmith (TV serial)',
    'Still Standing': 'Still Standing (American TV series)',
    'Run': 'Run (American TV series)',
    'The View': 'The View (talk show)',
    'What/If': 'What/If'
}

In [26]:
# Fixing the entries that grabbed data from the disambiguation page

for name, override_term in search_term_transform2.items():

    # The name must identify exactly one row. Names that match several rows (or none)
    # are reported by title here so they can be fixed separately by index;
    # calling int() on a Series is deprecated and gives an opaque TypeError.
    matches = wiki_articles_df.loc[wiki_articles_df['name'] == name, 'index_no']
    if len(matches) != 1:
        raise ValueError(f"expected exactly one row named {name!r}, found {len(matches)}")
    index_no = int(matches.iloc[0])

    warning_flag, search_term, search_url, wiki_text = override_query(override_term)

    # Overwrite only the Wikipedia-derived fields; 'index_no' and 'name' are kept.
    wiki_articles[index_no].update({
        'wiki_warning_flag': warning_flag,
        'wiki_search_term': search_term,
        'wiki_search_url': search_url,
        'wiki_text': wiki_text,
    })

In [27]:
#Outlier cases: entries addressed by row index rather than by name
# (for example, 'The Returned' occurs at two different indices).
# bad_indices[k] is overridden with search_term_transform3[k].
bad_indices = [1218, 1506, 1931, 1956, 1981, 2193]
search_term_transform3 = ["Being Human (North American TV series)",
                          "Shameless (British TV series)",
                          "Rebelde",
                          "The Returned (French TV series)", 
                          "The Returned (American TV series)",
                          "Taken (miniseries)"]

for index_no, override_term in zip(bad_indices, search_term_transform3):

    warning_flag, search_term, search_url, wiki_text = override_query(override_term)

    target = wiki_articles[index_no]
    target['wiki_warning_flag'] = warning_flag
    target['wiki_search_term'] = search_term
    target['wiki_search_url'] = search_url
    target['wiki_text'] = wiki_text

In [28]:
# sanity check: rebuild the frame and count entries still pointing to a disambiguation page
wiki_articles_df = pd.DataFrame.from_dict(wiki_articles, orient='index')
int(wiki_articles_df['wiki_text'].str.contains("may refer to").sum())

0

#### Summarize & write output to csv/json

In [29]:
wiki_articles_df.head()

Unnamed: 0,index_no,name,wiki_warning_flag,wiki_search_term,wiki_search_url,wiki_text
0,0,Game of Thrones,1,Game of thrones (TV series),https://en.wikipedia.org/wiki/Game_of_thrones_...,Game of Thrones is an American fantasy drama t...
1,1,Prison Break,1,Prison break (TV series),https://en.wikipedia.org/wiki/Prison_break_(TV...,Prison Break is an American serial drama telev...
2,2,Vikings,0,Vikings (2013 TV series),https://en.wikipedia.org/wiki/Vikings_(2013_TV...,Vikings is a historical drama television serie...
3,3,The Boys,0,The Boys (2019 TV series),https://en.wikipedia.org/wiki/The_Boys_(2019_T...,The Boys is an American superhero television s...
4,4,The Mandalorian,1,The Mandalorian (TV series),https://en.wikipedia.org/wiki/The_Mandalorian_...,The Mandalorian is an American space Western t...


In [30]:
wiki_articles_df.shape

(2484, 6)

In [31]:
wiki_articles_df['wiki_warning_flag'].value_counts()

1    1070
2     995
0     337
4      80
3       2

In [32]:
wiki_articles_df.to_csv('../data/wiki_articles_df.csv', index=False)

In [33]:
wiki_articles_df.to_json('../data/wiki_articles_df.json')  #backup, in case the commas inside 'wiki_text' column throw off comma separation in csv

In [34]:
# test_read_in_csv = pd.read_csv('../data/wiki_articles_df.csv', index_col="index_no")
# test_read_in_csv.head()


In [35]:
# test_read_in_json = pd.read_json('../data/wiki_articles_df.json')
# test_read_in_json.head()