In [4]:
# libraries
import pandas as pd
import numpy as np
import re
import requests
import json
from pprint import pprint
import os
import matplotlib.pyplot as plt

In [5]:
# read in data
# read_csv already returns a DataFrame, so the result is used directly
train_df = pd.read_csv('../resources/train.csv')
test_df = pd.read_csv('../resources/sample_submission.csv')
train_df.head()

Unnamed: 0,Id,pub_title,dataset_title,dataset_label,cleaned_label
0,d0fa7568-7d8e-4db9-870f-f9c6f668c17b,The Impact of Dual Enrollment on College Degre...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
1,2f26f645-3dec-485d-b68d-f013c9e05e60,Educational Attainment of High School Dropouts...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
2,c5d5cd2c-59de-4f29-bbb1-6a88c7b52f29,Differences in Outcomes for Female and Male St...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
3,5c9a3bc9-41ba-4574-ad71-e25c1442c8af,Stepping Stone and Option Value in a Model of ...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
4,c754dec7-c5a3-4337-9892-c02158475064,"Parental Effort, School Resources, and Student...",National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study


In [6]:
# kaggle functions
def clean_text(txt):
    """Normalise ``txt`` to lowercase ASCII alphanumerics separated by single spaces.

    The argument is coerced with ``str()`` and lowercased; every run of
    characters other than ``A-Z``, ``a-z`` and ``0-9`` is collapsed into one
    space, and leading/trailing whitespace is stripped from the result.
    """
    lowered = str(txt).lower()
    collapsed = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return collapsed.strip()

In [7]:
# Attach the cleaned, concatenated section text of each training publication
# to train_df in a new 'text' column (one JSON file per publication Id).
train_df['text'] = ''

for index, row in train_df.iterrows():
    filename = f"{row['Id']}.json"
    text = ''
    with open(f'../resources/train/{filename}') as jsonfile:
        data = json.load(jsonfile)
    # each entry of the JSON list is read for its 'section_title' and 'text' keys
    for i in data:
        text += f"Section title: {i['section_title']} | Text: {i['text']} "
    # Write through the frame itself: the `row` yielded by iterrows() may be a
    # copy, in which case assigning to row['text'] never reaches train_df.
    train_df.at[index, 'text'] = clean_text(text)

train_df.head()

Unnamed: 0,Id,pub_title,dataset_title,dataset_label,cleaned_label,text
0,d0fa7568-7d8e-4db9-870f-f9c6f668c17b,The Impact of Dual Enrollment on College Degre...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,section title what is this study about text th...
1,2f26f645-3dec-485d-b68d-f013c9e05e60,Educational Attainment of High School Dropouts...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,section title november 2004 text dropping out ...
2,c5d5cd2c-59de-4f29-bbb1-6a88c7b52f29,Differences in Outcomes for Female and Male St...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,section title differences in outcomes for fema...
3,5c9a3bc9-41ba-4574-ad71-e25c1442c8af,Stepping Stone and Option Value in a Model of ...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,section title abstract text federal reserve ba...
4,c754dec7-c5a3-4337-9892-c02158475064,"Parental Effort, School Resources, and Student...",National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,section title abstract text this article inves...


In [8]:
# add all dataset labels in training set to list
# (NaN labels are dropped: a non-string title would make the `in` test below raise TypeError)
datasets_titles = train_df.cleaned_label.dropna().unique()
# clean_text() of a title does not depend on the row, so normalise each title once
cleaned_titles = [(dataset_title, clean_text(dataset_title)) for dataset_title in datasets_titles]

# create all labels column for multi-label articles
# Rows sharing an Id get the same label string, so it is computed once per Id
# instead of re-filtering the whole frame for every row.
labels_by_id = {}
for pub_id, id_texts in train_df.groupby('Id', sort=False)['text']:
    text = id_texts.str.cat(sep='\n').lower()
    # plain substring match of every known title against the publication text
    labels_by_id[pub_id] = '|'.join(
        cleaned for dataset_title, cleaned in cleaned_titles if dataset_title in text
    )

# one entry per row, in row order; an Id with no group falls back to ''
labels = [labels_by_id.get(pub_id, '') for pub_id in train_df['Id']]

train_df['all_labels'] = labels

train_df.head()

Unnamed: 0,Id,pub_title,dataset_title,dataset_label,cleaned_label,text,all_labels
0,d0fa7568-7d8e-4db9-870f-f9c6f668c17b,The Impact of Dual Enrollment on College Degre...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,section title what is this study about text th...,national education longitudinal study|educatio...
1,2f26f645-3dec-485d-b68d-f013c9e05e60,Educational Attainment of High School Dropouts...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,section title november 2004 text dropping out ...,national education longitudinal study|educatio...
2,c5d5cd2c-59de-4f29-bbb1-6a88c7b52f29,Differences in Outcomes for Female and Male St...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,section title differences in outcomes for fema...,national education longitudinal study|educatio...
3,5c9a3bc9-41ba-4574-ad71-e25c1442c8af,Stepping Stone and Option Value in a Model of ...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,section title abstract text federal reserve ba...,national education longitudinal study|educatio...
4,c754dec7-c5a3-4337-9892-c02158475064,"Parental Effort, School Resources, and Student...",National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,section title abstract text this article inves...,national education longitudinal study|educatio...


In [12]:
# save combined df
# to_csv() does not create missing parent folders, so make sure the target exists
os.makedirs('../resources/combined_data', exist_ok=True)
train_df.to_csv('../resources/combined_data/train_with_text.csv', index=False)

In [13]:
import time
# attach the cleaned, concatenated section text of each test publication
# to test_df in a new 'text' column
test_df['text'] = ''
for row_label, pub_id in test_df['Id'].items():
    with open(f'../resources/test/{pub_id}.json') as jsonfile:
        sections = json.load(jsonfile)
    # every section contributes its title and body, in file order
    joined = ''.join(
        f"Section title: {section['section_title']} | Text: {section['text']} "
        for section in sections
    )
    test_df.at[row_label, 'text'] = clean_text(joined)
    # wait to prevent data rate error
    time.sleep(.5)
test_df

Unnamed: 0,Id,PredictionString,text
0,2100032a-7c33-4bff-97ef-690822c43466,,section title abstract text cognitive deficits...
1,2f392438-e215-4169-bebf-21ac4ff253e1,,section title introduction text this report de...
2,3f316b38-1a24-45a9-8d8c-4e05a42257c6,,section title introduction text cape hatteras ...
3,8e6996b4-ca08-4c0b-bed2-aaf07a4c6a60,,section title introduction text a significant ...


In [14]:
# save combined df
# to_csv() does not create missing parent folders, so make sure the target exists
os.makedirs('../resources/combined_data', exist_ok=True)
test_df.to_csv('../resources/combined_data/test_with_text.csv', index=False)