In [18]:
import json
import os
import subprocess
from pathlib import Path

import chardet
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy.engine import URL

# Working directory of the kernel at the time this cell runs. Relative paths
# such as "data/nc/..." used elsewhere in the notebook resolve against it.
current_directory = os.getcwd()


In [2]:
def detect_encoding(file_path, sample_size=10000):
    """Guess the text encoding of a file from its leading bytes.

    Args:
        file_path: Path of the file to inspect.
        sample_size: Number of bytes read from the start of the file and
            passed to ``chardet.detect``. Defaults to 10000.

    Returns:
        The encoding name reported by chardet, except that ``"utf-8"`` is
        returned when chardet reports ``"ascii"`` or reports no encoding.
    """
    with open(file_path, "rb") as f:
        sample = f.read(sample_size)

    detected = chardet.detect(sample)["encoding"]

    # The guess is based on a prefix only. A pure-ASCII prefix says nothing
    # about the rest of the file, and decoding with "ascii" fails on the first
    # non-ASCII byte further down. UTF-8 decodes every ASCII file identically,
    # so it is the safer answer; it is also used when chardet has no guess
    # (for example for an empty file).
    if detected is None or detected.lower() == "ascii":
        return "utf-8"
    return detected

def convert_utf(file_name, data_dir="data/nc", sep="\t"):
    """Re-encode ``<data_dir>/<file_name>.csv`` as UTF-8.

    The input is read with the encoding reported by ``detect_encoding`` and
    written next to it as ``<file_name>_utf8.csv``.

    Args:
        file_name: Base name of the input file, without the ``.csv`` suffix.
        data_dir: Directory holding the input file; the output is written to
            the same directory. Defaults to ``"data/nc"``.
        sep: Field delimiter of the input file. Defaults to a tab.

    Returns:
        None. The result is the file written to disk.
    """
    file_path = os.path.join(data_dir, f"{file_name}.csv")
    detected_encoding = detect_encoding(file_path)

    # NOTE(review): the data goes through pandas type inference before being
    # written back, so values are re-serialized rather than copied verbatim
    # (e.g. a numeric-looking column loses leading zeros) - confirm this is
    # acceptable for these files.
    df = pd.read_csv(file_path, encoding=detected_encoding, sep=sep)

    output_path = os.path.join(data_dir, f"{file_name}_utf8.csv")
    # to_csv uses its default delimiter here, so the output is comma-separated
    # even though the input is read with `sep`.
    df.to_csv(output_path, encoding="utf-8", index=False)

# Base names (without extension) of the files handed to convert_utf.
file_name_list = [
    f"nc_polling_place_{stamp}"
    for stamp in ("20121106", "20161108", "20201103")
]

for file in file_name_list:
    convert_utf(file)


In [3]:
# Connection settings are read from a JSON file in the user's home directory.
# The keys read below are: user, password, host, port, dbname.
with open(Path("~/pg_db_config.json").expanduser(), "r") as f:
    db_params = json.load(f)

# Build the URL from its components instead of interpolating them into a
# string: a user name or password containing characters such as "@", ":", "/"
# or "%" would otherwise produce a URL that parses incorrectly.
engine = create_engine(
    URL.create(
        drivername="postgresql",
        username=db_params["user"],
        password=db_params["password"],
        host=db_params["host"],
        port=db_params["port"],
        database=db_params["dbname"],
    )
)

def run_query(query):
    """Run ``query`` through the notebook-level ``engine`` and return the result as a DataFrame."""
    return pd.read_sql(query, engine)

# Select every column and row of analysis.nc_2016.
query1 = "\nSELECT * \nFROM analysis.nc_2016\n"

# Result of query1, loaded through run_query.
df = run_query(query1)

In [4]:

# Source snapshot and the target path for its UTF-8 copy; the two differ only
# by the "_utf8" suffix before the extension.
input_file, output_file = (
    f"/Users/robertness/projects/polling_locations/data/nc/VR_Snapshot_20161108{suffix}.txt"
    for suffix in ("", "_utf8")
)

# # Ensure the output directory exists
# output_dir = os.path.dirname(output_file)
# os.makedirs(output_dir, exist_ok=True)

# if not os.path.exists(output_file):
#     open(output_file, 'w').close()  # Creates an empty file

# # Run the iconv command inside Python
# try:
#     subprocess.run(["iconv", "-f", "UTF-16", "-t", "UTF-8", input_file, "-o", output_file], check=True)
#     print("Process complete! File converted successfully to UTF-8.")
# except subprocess.CalledProcessError as e:
#     print(f"Error converting file: {e}")



# print("Process complete!")
# os.system("say 'Process complete!'")  # Uses macOS built-in speech



In [17]:
# Read the tab-delimited voter_stats_20161108.txt file; the final expression
# shows its column names as the cell output.
df = pd.read_csv(
    "/Users/robertness/projects/polling_locations/data/nc/voter_stats_20161108.txt",
    delimiter="\t",
)
# df.to_csv('vr_snapshot_preview.csv',index=False)
list(df.columns)
# df

['county_desc',
 'election_date',
 'stats_type',
 'precinct_abbrv',
 'vtd_abbrv',
 'party_cd',
 'race_code',
 'ethnic_code',
 'sex_code',
 'age',
 'total_voters',
 'update_date']

In [6]:
# Shell commands kept for reference. This cell is only a string literal, so
# nothing below is executed by the notebook; the commands pipe iconv
# (UTF-16 -> UTF-8) through tr to drop NUL bytes, and use wc -l to count lines.
# Because this is a regular (non-raw) string, Python reads `\000` as an octal
# escape: the evaluated string holds a NUL character where the source text
# shows '\000'. Copy the commands from this source text, not from the output.
"""
iconv -f UTF-16 -t UTF-8 /Users/robertness/projects/polling_locations/data/nc/VR_Snapshot_20201103.txt | tr -d '\000' > /Users/robertness/projects/polling_locations/data/nc/VR_Snapshot_20201103_clean.txt

wc -l /Users/robertness/projects/polling_locations/data/nc/VR_Snapshot_20161108.txt

wc -l /Users/robertness/projects/polling_locations/data/nc/VR_Snapshot_20161108_clean.txt

iconv -f UTF-16 -t UTF-8 /Users/robertness/projects/polling_locations/data/nc/voter_stats_20161108.txt | tr -d '\000' > /Users/robertness/projects/polling_locations/data/nc/voter_stats_20161108_clean.txt

iconv -f UTF-16 -t UTF-8 /Users/robertness/projects/polling_locations/data/nc/voter_stats_20161108.txt | tr -d '\000' > /Users/robertness/projects/polling_locations/data/nc/voter_stats_20161108_fixed.txt

"""

'\niconv -f UTF-16 -t UTF-8 /Users/robertness/projects/polling_locations/data/nc/VR_Snapshot_20161108.txt > /Users/robertness/projects/polling_locations/data/nc/VR_Snapshot_20161108_utf8.txt\n'

In [19]:
file_path = "/Users/robertness/projects/polling_locations/data/nc/voter_stats_20161108.txt"

# Sample only the first 100KB of the file for detection.
with open(file_path, mode="rb") as f:
    raw_data = f.read(100000)

# chardet returns a dict; only its "encoding" entry is reported here.
result = chardet.detect(raw_data)
encoding = result["encoding"]

print(f"Detected encoding: {encoding}")


Detected encoding: ascii


In [34]:
import os
import pandas as pd
import chardet
import subprocess
from pathlib import Path


# Working directory of the kernel at the time this cell runs.
current_directory = os.getcwd()


# Directory the helpers below read from and write to.
# NOTE(review): this assumes the kernel was started in the parent of
# "polling_locations" - confirm, since other cells use absolute paths.
data_directory = os.path.join(current_directory, "polling_locations/data/nc")


def detect_encoding(file_path, sample_size=10000):
    """Guess the text encoding of a file from its leading bytes.

    Args:
        file_path: Path of the file to inspect.
        sample_size: Number of bytes read from the start of the file and
            passed to ``chardet.detect``. Defaults to 10000.

    Returns:
        The encoding name reported by chardet, except that ``"utf-8"`` is
        returned when chardet reports ``"ascii"`` or reports no encoding.
    """
    with open(file_path, "rb") as f:
        sample = f.read(sample_size)

    detected = chardet.detect(sample)["encoding"]

    # The guess is based on a prefix only. A pure-ASCII prefix says nothing
    # about the rest of the file, and decoding with "ascii" fails on the first
    # non-ASCII byte further down. UTF-8 decodes every ASCII file identically,
    # so it is the safer answer; it is also used when chardet has no guess
    # (for example for an empty file).
    if detected is None or detected.lower() == "ascii":
        return "utf-8"
    return detected


def remove_null_bytes(file_name, data_dir=None, chunk_size=1024 * 1024):
    """Strip every NUL (0x00) byte from ``<data_dir>/<file_name>.txt`` in place.

    The cleaned content is written to a sibling ``*.txt_clean`` file which then
    replaces the original, so the original file is overwritten.

    Args:
        file_name: Base name of the file, without the ``.txt`` suffix.
        data_dir: Directory containing the file. When omitted, the
            notebook-level ``data_directory`` is used.
        chunk_size: Number of bytes processed per read; must be at least 1.

    Returns:
        None. When the file does not exist a message is printed and nothing
        is modified.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")
    if data_dir is None:
        data_dir = data_directory

    file_path = os.path.join(data_dir, f"{file_name}.txt")

    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return

    temp_file = file_path + "_clean"

    # NOTE(review): dropping NUL bytes is not a real transcoding; for UTF-16
    # input it leaves the byte-order mark and mangles non-ASCII characters -
    # confirm the inputs only contain ASCII text.
    try:
        with open(file_path, "rb") as f_in, open(temp_file, "wb") as f_out:
            # Fixed-size chunks keep memory bounded even when the file has
            # very long lines or no newline at all; removing a single byte
            # value is unaffected by where the chunk boundaries fall.
            for chunk in iter(lambda: f_in.read(chunk_size), b""):
                f_out.write(chunk.replace(b"\x00", b""))
    except BaseException:
        # Do not leave a partially written "_clean" file next to the data.
        if os.path.exists(temp_file):
            os.remove(temp_file)
        raise

    os.replace(temp_file, file_path)
    print(f"NULL bytes removed from: {file_path}")


def process_csv(file_name):
    """Clean and load ``<data_directory>/<file_name>.txt`` as a DataFrame.

    NUL bytes are first removed from the file in place via
    ``remove_null_bytes``; the encoding is then detected with
    ``detect_encoding`` and the file is read as tab-separated text.

    Args:
        file_name: Base name of the file, without the ``.txt`` suffix.

    Returns:
        The loaded DataFrame, or None when the file does not exist (a message
        is printed in that case).
    """
    file_path = os.path.join(data_directory, f"{file_name}.txt")

    # Check up front: the cleaning step only prints and returns for a missing
    # file, which would let the encoding detection below fail with
    # FileNotFoundError right after the "File not found" message.
    if not os.path.exists(file_path):
        print(f"File not found: {file_path}")
        return None

    remove_null_bytes(file_name)

    detected_encoding = detect_encoding(file_path)

    df = pd.read_csv(file_path, encoding=detected_encoding, sep="\t")
    print(f"Successfully read {file_name}.txt with encoding: {detected_encoding}")

    return df


# Base names (without extension) of the files handed to process_csv.
file_name_list = [
    f"voter_stats_{stamp}"
    for stamp in ("20161108", "20201103", "20241105")
]


# The DataFrame returned by each call is not kept.
for file in file_name_list:
    process_csv(file)

