This notebook takes the articles and citations in .txt files in the data folder and turns them into either tsv or csv

## Imports

In [11]:
import pandas as pd
import numpy as np

import os
import fnmatch
import json

In [20]:
text_files_path ='../Data/textes_articles'
citation_files_path = '../Data/output'
# os.chdir(text_files_path) 
# os.getcwd()

In [14]:
os.getcwd()

'C:\\Users\\Younes\\Documents\\GitHub\\SHS-groupe8\\Code'

## Txt articles to df

### Functions

In [4]:
def articles_to_df(text_files_path):
    current_dir = os.getcwd()
    os.chdir(text_files_path) 
    
    file_list = [] # init empty list
    
    # get a list of the file names
    for file in os.listdir('.'):  # 
        if fnmatch.fnmatch(file, '*.txt'):
            file_list.append(file)
    
    article_dict = {} # init empty dict
    
    # fill dict with article ID as "key" and file content as "value"
    for file_name in file_list:
        with open(file_name, "r",encoding='utf8') as f: # , errors='ignore' if there are errors with encoding
            article_dict[file_name[:-4]] = ' '.join(f.readlines()) 
 
    # turn the dict into a dataframe
    df_new = pd.DataFrame.from_dict(article_dict,orient='index')
    df_new.reset_index(inplace=True)
    df_new.rename(columns={'index': 'externalid', 0: 'text'},inplace=True)
    df_new['externalid'] = df_new['externalid'].apply(lambda x : int(x))
    
    
    # print(len(article_dict))
    # print(len(file_list))
    
    os.chdir(current_dir) 
    return df_new

In [None]:
df_articles = articles_to_df(text_files_path)

# sanity checks
print(len(os.listdir(text_files_path)))
print(len(df_articles))

In [10]:
df_articles.head()

Unnamed: 0,externalid,text
0,100020966494,Des écharpes de brume serpentent sur l’autorou...
1,100103147779,"Genève, 22 février Depuis quelque temps, il y..."
2,100200769950,Un fil de fer tendu sur un étroit sentier aux ...
3,100286240311,"En novembre 2000, le journal «L’Équipe» avait ..."
4,100432865526,La soirée promet d’être électrique ce soir au ...


## Txt citations to df

In [25]:
def citations_to_df(citation_files_path):
    current_dir = os.getcwd()
    os.chdir(citation_files_path) 
    
    file_list = [] # init empty list

    # get a list of the file names
    for file in os.listdir('.'):  # 
        if fnmatch.fnmatch(file, '*.json'):
            file_list.append(file)

    citations_list = [] # init empty dict
    empty_files = []
    for citation_file_name in file_list:
        with open(citation_file_name, 'r', encoding='utf8') as json_data:
            # list of dicts in each file
            try :
                temp_citation_list = json.load(json_data)
            except :
                print(citation_file_name, 'is empty')
                empty_files.append(citation_file_name)

            # add article id to each citation
            for citation in temp_citation_list:
                citation['externalid'] = int(citation_file_name[:-5])

            citations_list = citations_list + temp_citation_list
    
    df = pd.DataFrame(citations_list)
    os.chdir(current_dir) 
    return df

In [26]:
df_citations = citations_to_df(citation_files_path)

43862564.json is empty


In [27]:
df_citations.head()

Unnamed: 0,speaker,speaker_index,quote,quote_index,verb,verb_index,quote_token_count,quote_type,is_floating_quote,reference,speaker_gender,externalid
0,Marc Ogorek,"(1377, 1388)",ne pas faire tout cela par plaisir,"(1396, 1430)",assure,"(1389, 1395)",6,SVC,False,Marc Ogorek,male,100020966494
1,Cheveux frisés et sourcils broussailleux,"(2955, 2996)","qu’en temps normal, il réalise trois quarts de...","(3079, 3210)",explique,"(3070, 3078)",23,SVC,False,Cheveux frisés et sourcils broussailleux,unknown,100020966494
2,l’un d’eux,"(3401, 3411)","""Mais au-delà, c’est toute la réciprocité des ...","(3310, 3395)",dit,"(3397, 3400)",18,QCQVS,False,l’un d’eux,unknown,100020966494
3,qui,"(1095, 1098)",L’hôpital de Genève vous remercie!,"(980,1014)",,,5,QCQVS,False,,unknown,100020966494
4,-il,"(1547, 1550)",tonne en direction des manifestants une autom...,"(1015,1517)",,,80,QCQVS,False,,male,100020966494


In [29]:
df_citations.to_csv('../Data/citations.csv', index=True)

In [30]:
df_citations_disk = pd.read_csv('../Data/citations.csv', index_col=0)

In [31]:
df_citations_disk.head()

Unnamed: 0,speaker,speaker_index,quote,quote_index,verb,verb_index,quote_token_count,quote_type,is_floating_quote,reference,speaker_gender,externalid
0,Marc Ogorek,"(1377, 1388)",ne pas faire tout cela par plaisir,"(1396, 1430)",assure,"(1389, 1395)",6,SVC,False,Marc Ogorek,male,100020966494
1,Cheveux frisés et sourcils broussailleux,"(2955, 2996)","qu’en temps normal, il réalise trois quarts de...","(3079, 3210)",explique,"(3070, 3078)",23,SVC,False,Cheveux frisés et sourcils broussailleux,unknown,100020966494
2,l’un d’eux,"(3401, 3411)","""Mais au-delà, c’est toute la réciprocité des ...","(3310, 3395)",dit,"(3397, 3400)",18,QCQVS,False,l’un d’eux,unknown,100020966494
3,qui,"(1095, 1098)",L’hôpital de Genève vous remercie!,"(980,1014)",,,5,QCQVS,False,,unknown,100020966494
4,-il,"(1547, 1550)",tonne en direction des manifestants une autom...,"(1015,1517)",,,80,QCQVS,False,,male,100020966494


In [32]:
df_citations_disk.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 72259 entries, 0 to 72258
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   speaker            66960 non-null  object
 1   speaker_index      66960 non-null  object
 2   quote              72259 non-null  object
 3   quote_index        72259 non-null  object
 4   verb               47221 non-null  object
 5   verb_index         47221 non-null  object
 6   quote_token_count  72259 non-null  int64 
 7   quote_type         72259 non-null  object
 8   is_floating_quote  72259 non-null  bool  
 9   reference          65282 non-null  object
 10  speaker_gender     72259 non-null  object
 11  externalid         72259 non-null  int64 
dtypes: bool(1), int64(2), object(9)
memory usage: 6.7+ MB
