In [1]:
cd /Users/nte/Documents/Chicago\ PhD\ Projects/Political\ culture\ maps/1.ANALYSIS


/Users/nte/Documents/Chicago PhD Projects/Political culture maps/1.ANALYSIS


!/usr/bin/env python coding: utf-8
Culture Maps Project

## Author: Nicolás Torres-Echeverry
## Created: July 2021
## Date(last modified): July 20th 2023
## Data storage: 
    ### Merges website's source data (from csv) and text data dictionary (from json) and saves it as a csv. 
    ### It allows to create different data frames filtering by the number of words (e.g., all websites with less than 3k)
## Working with El Corpus del Español

## Notebook index:
    # 1. Libraries
    # 2. Helper functions
    # 3. Pipeline
    # 4. Save data frame as csv

# 1. Libraries

In [2]:
# # 1. Libraries 

import pandas as pd
import json
import csv


# 2. Helper functions

In [3]:
def creates_df(source_list, websites_text, max_words, full_data=False):
    '''
    Merges sources data and websites text data from dictionary (json) and creates a 
    pandas dataframe. 
    
    Inpputs:
        source_list: (list) websites urls
        websites_text: (dictionary) 
                    key - (int) id
                    value - (string) websites' texts 
        max_words: (int) creates cut off depending on the number of words per website
                    e.g., if checkpoing 1K => no website with more than 1K words enters
        full_data: (boolean) defines data output 
        
    Output:
        If argument full_data TRUE => returns the entire data frame with NAs in text cells that it did not append.
            => span_df (pandas data frame) 
        If argument full_data FALSE => returns the entire data frame with NAs in text cells that it did not append.
             => smaller_df (pandas data frame)
        Defalt is FALSE.
        
    '''
    span_df = source_list
    
    span_df['text'] = ''
    span_df['id_string'] = span_df['textid'].astype(str)

    c = 0
    for index, row in span_df.iterrows():
        textid_value = row['id_string']
        if textid_value in websites_text and len(websites_text[textid_value]) < max_words:
            c = c + 1
            span_df.at[index, 'text'] = websites_text[textid_value]
        else:
            span_df.at[index, 'text'] = 'NA'

    if full_data:
        print("RETURNING FULL DATA FRAME")
        return span_df
    else:
        print("RETURNING SMALL DATA FRAME. NO NAs.") 
        smaller_df = span_df[span_df['text'] != 'NA']
        print("Number of website's texts incuded in data frame:", c) 
        return smaller_df

In [4]:
def count_matches(source_list, websites_text):
    '''
    Inpputs:
        source_list: (list) websites urls
        websites_text: (dictionary) 
                    key - (int) id
                    value - (string) websites' texts 
    Output:
        c: (int) count of matches
    '''
    
    c = 0
    
    for index, row in source_list.iterrows():
        textid_value = row['id_string']
        if textid_value in websites_text:
            c = c + 1

    print("Number of matches: ids both in dicitonary and source:", c)

    return c
    

# 3. Pipeline

In [5]:
# Reading json 

json_file = "dict_span_full.json"

with open(json_file, "r") as read_file:
    dict_text_websites = json.load(read_file)


In [6]:
# File path
file_path = 'sources_csv.csv'

# Read the CSV file into a pandas DataFrame
df_sources = pd.read_csv(file_path)

In [7]:
df_sources.columns

Index(['textid', 'words_number', 'genre', 'country', 'website', 'url',
       'title'],
      dtype='object')

In [8]:
df_sources.shape

(2096913, 7)

In [9]:
df = creates_df(df_sources, dict_text_websites, 1000, full_data=False)

RETURNING SMALL DATA FRAME. NO NAs.
Number of website's texts incuded in data frame: 223830


In [10]:
df.shape

(223830, 9)

#### I was able to run the funciton and create an smaller data frame for websites with less than 1,000 words.

In [12]:
df_3k = creates_df(df_sources, dict_text_websites, 3000, full_data=False)
df.shape

RETURNING SMALL DATA FRAME. NO NAs.
Number of website's texts incuded in data frame: 1016208


(223830, 9)

### Great! this is a larger dataframe than the one I had. It has 1,016,208

In [None]:
# checkpoint (if needed)

# count = count_matches(df_sources, dict_text_websites) 

### Interesting! From 21593 websites in the dictionary 21582 make match in the source!

# 4. Saves data frame

In [11]:
df.to_csv('span_full_text_1k_words.csv')

In [13]:
df_3k.to_csv('span_full_text_3k_words.csv')