In [1]:
from os import listdir
from os.path import isfile, join
import pandas as pd

In [2]:
def essayKeyChecker(dataframe_path, filename_path, essay_id_comp_path, key_path):
    """Check that essay_id, essay_id_comp and filename all point to the same essay.

    For every row of the key file the full text is fetched three ways -- from
    the main dataframe via ``essay_id``, from ``<filename>.txt`` in
    ``filename_path`` and from ``<essay_id_comp>.txt`` in
    ``essay_id_comp_path`` -- and the three texts are compared for equality.

    Parameters
    ----------
    dataframe_path : str
        Path to the main CSV; the columns ``essay_id`` and ``full_text`` are read.
    filename_path : str
        Folder holding the numbered essay files (trailing separator optional).
    essay_id_comp_path : str
        Folder holding the files named by essay_id_comp (trailing separator optional).
    key_path : str
        Path to the key CSV; the columns ``essay_id``, ``filename`` and
        ``essay_id_comp`` are read.

    Returns
    -------
    None
        The result is printed: a header with the number of rows checked, the
        positional index of every row whose three texts are not identical,
        and a final ``done!``.

    Raises
    ------
    IndexError
        If an ``essay_id`` in the key file has no row in the main dataframe.
    """
    essay_id_full_text = pd.read_csv(dataframe_path, low_memory=False)[['essay_id', 'full_text']]
    keyed_df = pd.read_csv(key_path)

    # Build the essay_id -> full_text lookup once instead of scanning the whole
    # frame for every key row. Keeping the first row per essay_id mirrors the
    # previous "take the first match" behaviour.
    first_per_id = essay_id_full_text.drop_duplicates(subset='essay_id')
    text_by_essay_id = dict(zip(first_per_id['essay_id'], first_per_id['full_text']))

    mismatched = []
    key_rows = zip(keyed_df['essay_id'], keyed_df['filename'], keyed_df['essay_id_comp'])
    for i, (essay_id, filename, essay_id_comp) in enumerate(key_rows):
        if essay_id not in text_by_essay_id:
            raise IndexError('essay_id {!r} from the key file was not found in {}'.format(essay_id, dataframe_path))
        essay_id_doc = text_by_essay_id[essay_id]
        # join() makes the folder arguments work with or without a trailing separator.
        # NOTE(review): files are read with open()'s default text encoding -- confirm
        # it matches the encoding of the main CSV on the machine running this.
        with open(join(filename_path, str(filename) + '.txt')) as f:
            filename_doc = f.read()
        with open(join(essay_id_comp_path, str(essay_id_comp) + '.txt')) as f:
            essay_id_comp_doc = f.read()
        if not (filename_doc == essay_id_doc == essay_id_comp_doc):
            mismatched.append(i)

    print('checking alignment of {} full text files'.format(len(keyed_df)))
    print('problem at the following index:')
    for i in mismatched:
        print(i)
    print('done!')

In [3]:
# Run the alignment check on the corpus CSV, the two essay folders and the key file.
# NOTE(review): 'id_keys.csv' is not written by any cell visible in this notebook -- confirm where it comes from.
essayKeyChecker(
    dataframe_path='../persuade_corpus.csv',
    filename_path='PERSUADE_TXT/',
    essay_id_comp_path='essay_files/',
    key_path='id_keys.csv',
)

checking alignment of 25996 full text files
problem at the following index:
done!


In [4]:
def createTable(path):
    """Read every regular file directly inside ``path`` into a dataframe.

    Parameters
    ----------
    path : str
        Folder to read. A trailing separator is optional. Sub-folders are skipped.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``id`` is the file name (extension included) and ``data``
        is the file's full text. Rows follow the order returned by
        ``listdir``. An empty folder gives an empty frame that still has both
        columns.
    """
    records = []
    for name in listdir(path):
        # Use the same joined path for the isfile() test and for open(), so the
        # function does not depend on ``path`` ending in a separator.
        file_path = join(path, name)
        if not isfile(file_path):
            continue
        with open(file_path) as f:
            records.append((name, f.read()))
    # Passing the column names here (instead of assigning them afterwards)
    # also works when no files were found.
    return pd.DataFrame(records, columns=['id', 'data'])

In [5]:
# Build one table per essay folder. number_list gets its ids from the numerically named files in the
# folder Scott shared; essay_id_list gets its ids from the files named by essay_id_comp.
number_list, essay_id_list = (
    createTable(folder) for folder in ('PERSUADE_TXT/', 'essay_files/')
)

In [6]:
# Join the two tables on the essay text, so each row pairs the essay_id_comp file name with the numbered
# file name of the same essay. Columns are renamed by name rather than by position.
combined_list = (
    essay_id_list
    .merge(number_list, on="data")
    .rename(columns={'id_x': 'essay_id_comp', 'data': 'full_text', 'id_y': 'filename'})
)
# Strip the .txt extension from both id columns. The pattern is an explicit, anchored regex: with a bare
# '.txt' the result depends on the pandas version's default for `regex`, and an unescaped '.' matches any
# character, so text in the middle of an id could be removed.
for id_column in ('essay_id_comp', 'filename'):
    combined_list[id_column] = combined_list[id_column].str.replace(r'\.txt$', '', regex=True)
combined_list.head()

Unnamed: 0,essay_id_comp,full_text,filename
0,8CE67A7B4BDF,Everyone has either given advice or been on th...,25207
1,1D549C77CB16,There are mny factors that influence using tec...,9957
2,5CEA51DD2414,Did you know that the majority of students cur...,7228
3,E53A89E4CAB5,"Dear, Senator\n\nI'am a conserend voter that b...",19196
4,120174AE39B7,Dear principal\n\nAs a student i believe that ...,21798


In [7]:
# Take essay_id, full_text and holistic_essay_score from the master dataframe and drop duplicate rows.
# Attach them to the combined key table by matching on the essay text, then print the row count and
# whether any value is null.

master_df = (
    pd.read_csv('../persuade_corpus.csv', low_memory=False)
    [['essay_id', 'full_text', 'holistic_essay_score']]
    .drop_duplicates()
    .reset_index(drop=True)
)
final_key_list = pd.merge(combined_list, master_df, on='full_text').reset_index(drop=True)
print(final_key_list.shape[0])
print(final_key_list.isnull().values.any())
final_key_list.head()

25996
False


Unnamed: 0,essay_id_comp,full_text,filename,essay_id,holistic_essay_score
0,8CE67A7B4BDF,Everyone has either given advice or been on th...,25207,AAAXMP138200000740562810_OR,4
1,1D549C77CB16,There are mny factors that influence using tec...,9957,AAATRP14318001044789,5
2,5CEA51DD2414,Did you know that the majority of students cur...,7228,AAAXMP138200001722342850_OR,4
3,E53A89E4CAB5,"Dear, Senator\n\nI'am a conserend voter that b...",19196,5353239,4
4,120174AE39B7,Dear principal\n\nAs a student i believe that ...,21798,2021004468,4


In [8]:
# Load the NLP indices (GAMAT not included) and join them onto the key columns by filename.
# The first CSV column is read as the index and then discarded. filename is cast to str so it has the
# same type as the filename column of final_key_list. Finally print the row count and whether any
# value is null.
NLPindices = pd.read_csv('PERSUADE_Combined_Results.csv', index_col=0).reset_index(drop=True)
NLPindices['filename'] = NLPindices['filename'].astype('str')
keyed_df = pd.merge(
    final_key_list[['essay_id', 'essay_id_comp', 'holistic_essay_score', 'filename']],
    NLPindices,
    on='filename',
)
print(keyed_df.shape[0])
print(keyed_df.isnull().values.any())
keyed_df.head()

25996
False


Unnamed: 0,essay_id,essay_id_comp,holistic_essay_score,filename,nwords,Admiration/Awe_GALC,Amusement_GALC,Anger_GALC,Anxiety_GALC,Beingtouched_GALC,...,acad_av_lemma_construction_freq_log_stdev,news_av_lemma_freq_log_stdev,news_av_construction_freq_log_stdev,news_av_lemma_construction_freq_log_stdev,mag_av_lemma_freq_log_stdev,mag_av_construction_freq_log_stdev,mag_av_lemma_construction_freq_log_stdev,fic_av_lemma_freq_log_stdev,fic_av_construction_freq_log_stdev,fic_av_lemma_construction_freq_log_stdev
0,AAAXMP138200000740562810_OR,8CE67A7B4BDF,4,25207,401,0.0,0.0,0.0,0.0,0.0,...,53350.727204,489237.15468,188269.713837,69194.44459,523141.169702,215618.093412,74087.177064,585607.785392,189674.745932,82362.888418
1,AAATRP14318001044789,1D549C77CB16,5,9957,519,0.0,0.001927,0.003854,0.0,0.001927,...,84982.005646,601951.944262,234230.789288,103589.56852,645840.947315,262549.754574,110424.796041,723309.224262,242698.473784,132408.480219
2,AAAXMP138200001722342850_OR,5CEA51DD2414,4,7228,372,0.0,0.0,0.0,0.0,0.0,...,46829.731988,439442.732543,184474.738265,73899.413069,471573.417663,198353.207256,74302.332894,528110.237696,213493.762108,84516.752756
3,5353239,E53A89E4CAB5,4,19196,473,0.0,0.002114,0.0,0.0,0.002114,...,112610.880402,705367.885638,213947.177799,149410.753621,758839.224575,233121.873893,158628.987693,841665.358075,239855.646653,178841.113231
4,2021004468,120174AE39B7,4,21798,211,0.0,0.0,0.0,0.0,0.0,...,91779.428266,714543.266075,134882.302618,112743.434223,765949.242037,137868.174323,117844.608539,860698.191333,171388.047766,140480.780184


In [9]:
# Load and clean up the GAMAT results. These are keyed to essay_id_comp, but that shouldn't be a problem.
# The filename column holds a full path to each essay file; reduce it to the bare essay_id_comp by
# removing everything up to the last path separator ('/' or '\') and the trailing .txt extension.
# This does not depend on the absolute folder the GAMAT run happened to use, and the explicit, anchored
# regexes behave the same across pandas versions.
GAMATresults = pd.read_csv('GAMATresults.csv')
GAMATresults['filename'] = (
    GAMATresults['filename']
    .str.replace(r'^.*[\\/]', '', regex=True)
    .str.replace(r'\.txt$', '', regex=True)
)
GAMATresults = GAMATresults.rename(columns={'filename': 'essay_id_comp'})
GAMATresults.head()

Unnamed: 0,essay_id_comp,error_count,word_count,error_count_per_100_words,duplication,duplication_per_100_words,grammar,grammar_per_100_words,misspelling,misspelling_per_100_words,...,WORST_THAN,WORTH_WHILE,WRONG_APOSTROPHE,YOULL_WILL,YOUR,YOURS_APOSTROPHE,YOUR_NN,YOUR_S,YOUR_SHOULD,YOU_THING
0,0000D23A521A,25,251,9.960159,0,0.0,0,0.0,25,9.960159,...,0,0,0,0,0,0,0,0,0,0
1,00066EA9880D,110,646,17.027864,0,0.0,0,0.0,110,17.027864,...,0,0,0,0,0,0,0,0,0,0
2,0006ED03C701,70,265,26.415094,0,0.0,0,0.0,40,15.09434,...,0,0,0,0,0,0,0,0,0,0
3,000A58BC095E,15,254,5.905512,0,0.0,10,3.937008,5,1.968504,...,0,0,0,0,0,0,0,0,0,0
4,000BAD50D026,120,386,31.088083,0,0.0,70,18.134715,20,5.181347,...,0,0,0,0,0,0,0,0,0,0


In [10]:
# Attach the GAMAT columns to keyed_df by essay_id_comp, then print the row count and whether any
# value is null.
# NOTE(review): keyed_df is reassigned from itself here, so re-running this cell without re-running
# the cell that first builds keyed_df merges the GAMAT columns a second time.
keyed_df = pd.merge(keyed_df, GAMATresults, on='essay_id_comp')
print(keyed_df.shape[0])
print(keyed_df.isnull().values.any())
keyed_df.head()

25996
False


Unnamed: 0,essay_id,essay_id_comp,holistic_essay_score,filename,nwords,Admiration/Awe_GALC,Amusement_GALC,Anger_GALC,Anxiety_GALC,Beingtouched_GALC,...,WORST_THAN,WORTH_WHILE,WRONG_APOSTROPHE,YOULL_WILL,YOUR,YOURS_APOSTROPHE,YOUR_NN,YOUR_S,YOUR_SHOULD,YOU_THING
0,AAAXMP138200000740562810_OR,8CE67A7B4BDF,4,25207,401,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
1,AAATRP14318001044789,1D549C77CB16,5,9957,519,0.0,0.001927,0.003854,0.0,0.001927,...,0,0,0,0,0,0,0,0,0,0
2,AAAXMP138200001722342850_OR,5CEA51DD2414,4,7228,372,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,5353239,E53A89E4CAB5,4,19196,473,0.0,0.002114,0.0,0.0,0.002114,...,0,0,0,0,0,0,0,0,0,0
4,2021004468,120174AE39B7,4,21798,211,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [8]:
# Save the fully merged dataframe to disk as a CSV file
keyed_df.to_csv(path_or_buf='PERSUADE_NLP_indices.csv')

problem at the following index:
done!
