In [1]:
# libraries
import pandas as pd
from tqdm.auto import tqdm
import numpy as np
import re
import requests
import json
import os
import matplotlib.pyplot as plt

In [2]:
# Load the labelled training table and the sample-submission table from disk.
# read_csv already hands back a DataFrame, so the result is used directly.
train_df = pd.read_csv('../resources/train.csv')
test_df = pd.read_csv('../resources/sample_submission.csv')

# preview the first rows of the training table
train_df.head()

Unnamed: 0,Id,pub_title,dataset_title,dataset_label,cleaned_label
0,d0fa7568-7d8e-4db9-870f-f9c6f668c17b,The Impact of Dual Enrollment on College Degre...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
1,2f26f645-3dec-485d-b68d-f013c9e05e60,Educational Attainment of High School Dropouts...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
2,c5d5cd2c-59de-4f29-bbb1-6a88c7b52f29,Differences in Outcomes for Female and Male St...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
3,5c9a3bc9-41ba-4574-ad71-e25c1442c8af,Stepping Stone and Option Value in a Model of ...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study
4,c754dec7-c5a3-4337-9892-c02158475064,"Parental Effort, School Resources, and Student...",National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study


In [3]:
# kaggle functions
def clean_text(txt):
    """Normalise a value into lower-case, space-separated alphanumeric tokens.

    The input is converted with ``str()`` and lower-cased; every run of
    characters outside ``A-Z``, ``a-z`` and ``0-9`` is then replaced by a
    single space, and leading/trailing whitespace is stripped.

    Args:
        txt: Any object; non-strings are stringified first.

    Returns:
        The normalised string (possibly empty).
    """
    lowered = str(txt).lower()
    collapsed = re.sub('[^A-Za-z0-9]+', ' ', lowered)
    return collapsed.strip()

In [4]:
tqdm.pandas()

# Build one text blob per training row from that publication's JSON file
# (a list of sections, each with 'section_title' and 'text').
study_text = []

for pub_id in train_df['Id']:
    # Explicit utf-8 so decoding does not depend on the platform's default locale.
    with open(f'../resources/train/{pub_id}.json', encoding='utf-8') as jsonfile:
        sections = json.load(jsonfile)
    # Join once instead of growing the string with repeated +=.
    text = ' '.join(
        f"Section title: {section['section_title']} | Text: {section['text']}"
        for section in sections
    )
    # Normalise with clean_text, exactly as the test-set cell does. The label
    # matching later in this notebook compares lower-cased labels against this
    # column, so leaving the raw mixed-case text here would make most matches fail.
    study_text.append(clean_text(text))

train_df['text'] = study_text

train_df.head()

Unnamed: 0,Id,pub_title,dataset_title,dataset_label,cleaned_label,text
0,d0fa7568-7d8e-4db9-870f-f9c6f668c17b,The Impact of Dual Enrollment on College Degre...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,section title what is this study about text th...
1,2f26f645-3dec-485d-b68d-f013c9e05e60,Educational Attainment of High School Dropouts...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,section title november 2004 text dropping out ...
2,c5d5cd2c-59de-4f29-bbb1-6a88c7b52f29,Differences in Outcomes for Female and Male St...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,section title differences in outcomes for fema...
3,5c9a3bc9-41ba-4574-ad71-e25c1442c8af,Stepping Stone and Option Value in a Model of ...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,section title abstract text federal reserve ba...
4,c754dec7-c5a3-4337-9892-c02158475064,"Parental Effort, School Resources, and Student...",National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,section title abstract text this article inves...


In [8]:
tqdm.pandas()

# add all dataset labels in training set to list
dataset_labels = [x.lower() for x in train_df['dataset_label'].unique()]
dataset_titles = [x.lower() for x in train_df['dataset_title'].unique()]
cleaned_labels = [x.lower() for x in train_df['cleaned_label'].unique()]
label_list = set(dataset_labels + dataset_titles + cleaned_labels)

# A set of strings iterates in hash order, which differs between interpreter
# sessions, so joining straight from the set would write the labels in a
# different order on every kernel restart. Iterate a sorted copy instead so the
# 'all_labels' strings are reproducible; label_list itself stays a set.
ordered_labels = sorted(label_list)

# For every row, collect each known label that occurs as a substring of the
# row's text and store them as one '|'-separated string.
all_labels = [
    '|'.join(label for label in ordered_labels if label in text).strip()
    for text in train_df['text']
]

train_df['all_labels'] = all_labels

train_df.head()

Unnamed: 0,Id,pub_title,dataset_title,dataset_label,cleaned_label,text,all_labels
0,d0fa7568-7d8e-4db9-870f-f9c6f668c17b,The Impact of Dual Enrollment on College Degre...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,what is this study about this study used data ...,education longitudinal study|national educatio...
1,2f26f645-3dec-485d-b68d-f013c9e05e60,Educational Attainment of High School Dropouts...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,november 2004 dropping out of high school is n...,education longitudinal study|national educatio...
2,c5d5cd2c-59de-4f29-bbb1-6a88c7b52f29,Differences in Outcomes for Female and Male St...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,differences in outcomes for female and male st...,education longitudinal study|national educatio...
3,5c9a3bc9-41ba-4574-ad71-e25c1442c8af,Stepping Stone and Option Value in a Model of ...,National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,abstract federal reserve bank of richmond s1 a...,education longitudinal study|national educatio...
4,c754dec7-c5a3-4337-9892-c02158475064,"Parental Effort, School Resources, and Student...",National Education Longitudinal Study,National Education Longitudinal Study,national education longitudinal study,abstract this article investigates an importan...,education longitudinal study|national educatio...


In [9]:
# save combined df
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists first (a no-op when it is already there).
os.makedirs('../resources/combined_data', exist_ok=True)
train_df.to_csv('../resources/combined_data/train_with_text_labels.csv', index=False)

In [None]:
tqdm.pandas()

# Build one cleaned text blob per test row from that publication's JSON file
# (a list of sections, each with 'section_title' and 'text') and store it in a
# new 'text' column.
study_text_test = []
for pub_id in test_df['Id']:
    # Explicit utf-8 so decoding does not depend on the platform's default locale.
    with open(f'../resources/test/{pub_id}.json', encoding='utf-8') as jsonfile:
        sections = json.load(jsonfile)
    # Join once instead of growing the string with repeated +=; clean_text
    # strips the ends, so the cleaned result matches the old concatenation.
    text = ' '.join(
        f"Section title: {section['section_title']} | Text: {section['text']}"
        for section in sections
    )
    study_text_test.append(clean_text(text))

test_df['text'] = study_text_test

In [11]:
# save combined df
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists first (a no-op when it is already there).
os.makedirs('../resources/combined_data', exist_ok=True)
test_df.to_csv('../resources/combined_data/test_with_text.csv', index=False)