In [1]:
import os
import pandas as pd
import csv
import glob
import logging

In [2]:
logging.basicConfig(filename="error_log.txt", level=logging.ERROR, format="%(asctime)s - %(message)s")

In [3]:
def read_tsv_safely(file_path):
    """
    Reads a TSV file safely, logging errors if they occur.
    """
    try:
        df = pd.read_csv(
            file_path,
            sep='\t',
            engine='python',
            quoting=3,  # Avoid strict quoting issues
            on_bad_lines="skip",  # Skip problematic lines
        )
        return df
    except Exception as e:
        logging.error(f"Error reading {file_path}: {e}")
        print(f"Error reading {file_path}: {e}")
        return None

def combine_tsv_files(folder_path):
    """
    Combines all .tsv files in the specified folder into a single DataFrame.
    """
    tsv_files = [f for f in os.listdir(folder_path) if f.endswith('.tsv')]
    combined_dfs = []

    for file in tsv_files:
        file_path = os.path.join(folder_path, file)
        print(f"Processing: {file}")
        df = read_tsv_safely(file_path)
        if df is not None:
            print(f"Number of rows in {file}: {len(df)}")
            combined_dfs.append(df)
        else:
            print(f"Skipping {file} due to errors.")

    if combined_dfs:
        combined_df = pd.concat(combined_dfs, ignore_index=True)
        return combined_df
    else:
        print("No valid data to combine.")
        return pd.DataFrame()

In [4]:
folder_path = "../data/govdirectory"
output_file = "../data/combined_tsv_data.tsv"

combined_df = combine_tsv_files(folder_path)

Processing: Ukraine.tsv
Number of rows in Ukraine.tsv: 48
Processing: East Timor.tsv
Number of rows in East Timor.tsv: 565
Processing: Iceland.tsv
Number of rows in Iceland.tsv: 85
Processing: Ghana.tsv
Number of rows in Ghana.tsv: 306
Processing: Russia.tsv
Number of rows in Russia.tsv: 23
Processing: Bermuda.tsv
Number of rows in Bermuda.tsv: 25
Processing: Malaysia.tsv
Number of rows in Malaysia.tsv: 206
Processing: Switzerland.tsv
Number of rows in Switzerland.tsv: 2161
Processing: Nepal.tsv
Number of rows in Nepal.tsv: 113
Processing: Cameroon.tsv
Number of rows in Cameroon.tsv: 51
Processing: Sweden.tsv
Number of rows in Sweden.tsv: 776
Processing: Belgium.tsv
Number of rows in Belgium.tsv: 972
Processing: Latvia.tsv
Number of rows in Latvia.tsv: 97
Processing: Austria.tsv
Number of rows in Austria.tsv: 2209
Processing: South Africa.tsv
Number of rows in South Africa.tsv: 323
Processing: Portugal.tsv
Number of rows in Portugal.tsv: 347
Processing: Luxembourg.tsv
Number of rows in

In [5]:
combined_df.head()

Unnamed: 0,"""Name""","""Govdirectory URL""","""Type""","""Website"""
0,"""Cherkasy Oblast""","""https://www.govdirectory.org/ukraine/Q161808/""","""oblast of Ukraine""","""https://www.oblradack.gov.ua/"""
1,"""""Chernihiv Oblast""","""https://www.govdirectory.org/ukraine/Q167874/""","""oblast of Ukraine""","""https://cg.gov.ua/"""
2,"""""Chernivtsi Oblast""","""https://www.govdirectory.org/ukraine/Q168856/""","""oblast of Ukraine""","""https://bukoda.gov.ua/"""
3,"""""Dnipropetrovsk Oblast""","""https://www.govdirectory.org/ukraine/Q170672/""","""oblast of Ukraine""","""http://www.adm.dp.gov.ua/"""
4,"""""Donetsk Oblast""","""https://www.govdirectory.org/ukraine/Q2012050/""","""oblast of Ukraine""","""https://dn.gov.ua/"""


In [6]:
combined_df.shape

(12255, 4)

In [7]:
def strip_quotes_from_dataframe(df: pd.DataFrame) -> pd.DataFrame:
    """
    Strips quotes from all string values and column names in a DataFrame.
    
    Parameters:
        df (pd.DataFrame): Input DataFrame with possible quoted values and column names.
        
    Returns:
        pd.DataFrame: DataFrame with quotes stripped from all string values and column names.
    """
    # Remove quotes from column names
    df.columns = [col.strip('"') for col in df.columns]
    
    # Remove quotes from values
    return df.applymap(lambda x: x.strip('"') if isinstance(x, str) else x)

In [8]:
strip_quotes_from_dataframe(combined_df)

Unnamed: 0,Name,Govdirectory URL,Type,Website
0,Cherkasy Oblast,https://www.govdirectory.org/ukraine/Q161808/,oblast of Ukraine,https://www.oblradack.gov.ua/
1,Chernihiv Oblast,https://www.govdirectory.org/ukraine/Q167874/,oblast of Ukraine,https://cg.gov.ua/
2,Chernivtsi Oblast,https://www.govdirectory.org/ukraine/Q168856/,oblast of Ukraine,https://bukoda.gov.ua/
3,Dnipropetrovsk Oblast,https://www.govdirectory.org/ukraine/Q170672/,oblast of Ukraine,http://www.adm.dp.gov.ua/
4,Donetsk Oblast,https://www.govdirectory.org/ukraine/Q2012050/,oblast of Ukraine,https://dn.gov.ua/
...,...,...,...,...
12250,North West Community Development Council,https://www.govdirectory.org/singapore/Q5784118/,Community Development Council,https://www.cdc.gov.sg/northwest
12251,South East Community Development Council,https://www.govdirectory.org/singapore/Q1687545/,Community Development Council,
12252,South West Community Development Council,https://www.govdirectory.org/singapore/Q5784126/,Community Development Council,https://www.cdc.gov.sg/southwest
12253,Parliament of Singapore,https://www.govdirectory.org/singapore/Q1517231/,parliament,https://www.parliament.gov.sg/


In [9]:
strip_quotes_from_dataframe(combined_df).to_csv(output_file, sep='\t', index=False)