<a href="https://colab.research.google.com/github/zhaodrago-cell/Project-1/blob/main/01_Data_Cleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import os, zipfile, shutil, textwrap, re
from pathlib import Path

In [3]:
# Project identifier string and a path to an njmin.zip archive.
# NOTE(review): neither name is referenced by the other cells visible in this
# notebook; the /mnt/data location is presumably a leftover from a different
# environment -- confirm before relying on it.
PROJECT, ZIP_PATH = (
    "ck1994-njpa-minwage-replication",
    "/mnt/data/njmin.zip",
)

In [9]:
import os
import subprocess
import zipfile
from pathlib import Path

# Project root on the notebook VM and the folder that receives the raw inputs.
project_folder = "/content/replication_project"
raw_folder = f"{project_folder}/data/raw"

Path(raw_folder).mkdir(parents=True, exist_ok=True)
print(" Folder created:", raw_folder)

zip_url = "https://davidcard.berkeley.edu/data_sets/njmin.zip"
zip_path = f"{raw_folder}/njmin.zip"

# Run wget through subprocess (argument list, no shell) so its exit status is
# available. `wget -O` creates the output file even when the transfer fails,
# which means a bare existence check cannot detect a failed download.
download = subprocess.run(["wget", "-q", "-O", zip_path, zip_url])

# Fail fast if wget reported an error or if what landed on disk is missing,
# empty, or not a zip archive (is_zipfile returns False in all three cases).
if download.returncode != 0 or not zipfile.is_zipfile(zip_path):
    raise FileNotFoundError(f"Download failed: {zip_path}")

print(" Data download complete:", zip_path)

# Unpack every archive member next to the archive itself.
with zipfile.ZipFile(zip_path, "r") as z:
    z.extractall(raw_folder)

print(" Decompression complete")

# Show what is now available in the raw data folder.
print("\n raw folder contents:")
for file in sorted(os.listdir(raw_folder)):
    print(" -", file)

 Folder created: /content/replication_project/data/raw
 Data download complete: /content/replication_project/data/raw/njmin.zip
 Decompression complete

 raw folder contents:
 - check.sas
 - codebook
 - njmin.zip
 - public.dat
 - read.me
 - survey1.nj
 - survey2.nj


In [11]:
import pandas as pd
import numpy as np
import re
from pathlib import Path

# Locate the fixed-width data file and its codebook inside the raw data folder.
project_folder = "/content/replication_project"
raw_folder = Path(project_folder, "data", "raw")

public_path = raw_folder / "public.dat"
codebook_path = raw_folder / "codebook"

# Undecodable bytes are substituted rather than raising.
codebook_text = codebook_path.read_text(encoding="latin1", errors="replace")

# Rows of interest look like:
#   NAME  <start>  <end>  <digits/dots token>  <free text>
# Only the first three fields (name, start column, end column) are kept.
pattern = re.compile(r'^([A-Z0-9_]+)\s+(\d+)\s+(\d+)\s+([0-9.]+)\s+(.*)$')

row_matches = (pattern.match(row.strip()) for row in codebook_text.splitlines())
vars_ = [
    (fields[0], int(fields[1]), int(fields[2]))
    for fields in (hit.groups() for hit in row_matches if hit)
]

print(" Parsed variables:", len(vars_))
print(" First 10:", vars_[:10])

# pandas colspecs are 0-based, half-open intervals, so the start position is
# shifted down by one while the end position is used as-is.
names = [var_name for var_name, _, _ in vars_]
colspecs = [(first - 1, last) for _, first, last in vars_]

df_raw = pd.read_fwf(public_path, names=names, colspecs=colspecs)

print(" df_raw shape:", df_raw.shape)
df_raw.head()

 Parsed variables: 46
 First 10: [('SHEET', 1, 3), ('CHAIN', 5, 5), ('CO_OWNED', 7, 7), ('STATE', 9, 9), ('SOUTHJ', 11, 11), ('CENTRALJ', 13, 13), ('NORTHJ', 15, 15), ('PA1', 17, 17), ('PA2', 19, 19), ('SHORE', 21, 21)]
 df_raw shape: (410, 46)


Unnamed: 0,SHEET,CHAIN,CO_OWNED,STATE,SOUTHJ,CENTRALJ,NORTHJ,PA1,PA2,SHORE,...,FIRSTIN2,SPECIAL2,MEALS2,OPEN2R,HRSOPEN2,PSODA2,PFRY2,PENTREE2,NREGS2,NREGS112
0,46,1,0,0,0,0,0,1,0,0,...,0.08,1,2,6.5,16.5,1.03,.,0.94,4,4
1,49,2,0,0,0,0,0,1,0,0,...,0.05,0,2,10.0,13.0,1.01,0.89,2.35,4,4
2,506,2,1,0,0,0,0,1,0,0,...,0.25,.,1,11.0,11.0,0.95,0.74,2.33,4,3
3,56,4,1,0,0,0,0,1,0,0,...,0.15,0,2,10.0,12.0,0.92,0.79,0.87,2,2
4,61,4,1,0,0,0,0,1,0,0,...,0.15,0,2,10.0,12.0,1.01,0.84,0.95,2,2


In [12]:
# Destination for cleaned outputs; created on demand.
processed_folder = Path(project_folder) / "data" / "processed"
processed_folder.mkdir(parents=True, exist_ok=True)


def _to_numeric_or_keep(column):
    """Return `column` converted to a numeric dtype, or unchanged if it cannot be parsed.

    Replaces `pd.to_numeric(..., errors="ignore")`, which is deprecated and
    emits a FutureWarning; catching the exception explicitly gives the same
    result without the warning.
    """
    try:
        return pd.to_numeric(column)
    except (ValueError, TypeError):
        return column


# Treat a lone "." (optionally with one leading or trailing space) as missing.
df = df_raw.replace({".": np.nan, " .": np.nan, ". ": np.nan})

# Convert each column to a numeric dtype where every value parses; columns
# that contain anything non-numeric are left untouched.
for c in df.columns:
    df[c] = _to_numeric_or_keep(df[c])

out_path = processed_folder / "public_clean.csv"
df.to_csv(out_path, index=False)

print(" Saved cleaned file:", out_path)
df.head()

✅ Saved cleaned file: /content/replication_project/data/processed/public_clean.csv


  df[c] = pd.to_numeric(df[c], errors="ignore")


Unnamed: 0,SHEET,CHAIN,CO_OWNED,STATE,SOUTHJ,CENTRALJ,NORTHJ,PA1,PA2,SHORE,...,FIRSTIN2,SPECIAL2,MEALS2,OPEN2R,HRSOPEN2,PSODA2,PFRY2,PENTREE2,NREGS2,NREGS112
0,46,1,0,0,0,0,0,1,0,0,...,0.08,1.0,2.0,6.5,16.5,1.03,,0.94,4.0,4.0
1,49,2,0,0,0,0,0,1,0,0,...,0.05,0.0,2.0,10.0,13.0,1.01,0.89,2.35,4.0,4.0
2,506,2,1,0,0,0,0,1,0,0,...,0.25,,1.0,11.0,11.0,0.95,0.74,2.33,4.0,3.0
3,56,4,1,0,0,0,0,1,0,0,...,0.15,0.0,2.0,10.0,12.0,0.92,0.79,0.87,2.0,2.0
4,61,4,1,0,0,0,0,1,0,0,...,0.15,0.0,2.0,10.0,12.0,1.01,0.84,0.95,2.0,2.0


In [13]:
import os
# Sanity check: print the sorted file names found in the raw data folder.
raw_listing = os.listdir("/content/replication_project/data/raw")
raw_listing.sort()
print(raw_listing)

['check.sas', 'codebook', 'njmin.zip', 'public.dat', 'read.me', 'survey1.nj', 'survey2.nj']


In [15]:
import os
from pathlib import Path
import shutil

ROOT = Path("/content/replication_project")

# Create the notebook and output folders; existing folders are left alone.
notebooks_dir = ROOT / "notebooks"
outputs_dir = ROOT / "outputs"
for folder in (notebooks_dir, outputs_dir / "figures", outputs_dir / "tables"):
    folder.mkdir(parents=True, exist_ok=True)
print(" notebooks folder created:", notebooks_dir)

# Ignore rules written to the project root: Python caches and logs, OS
# metadata files, generated outputs, and the raw/processed data folders.
gitignore_text = """# Python
__pycache__/
*.pyc
.ipynb_checkpoints/
.pytest_cache/
.cache/
*.log

.DS_Store
Thumbs.db


outputs/

data/raw/
data/processed/
"""

gitignore_path = ROOT / ".gitignore"
gitignore_path.write_text(gitignore_text, encoding="utf-8")
print(" Generated .gitignore:", gitignore_path)

# Archive the project tree only after .gitignore exists so the archive
# includes it. The zip is written alongside ROOT (base name = str(ROOT)),
# not inside the folder being archived.
zip_out = shutil.make_archive(base_name=str(ROOT), format="zip", root_dir=str(ROOT))
print(" Packaged:", zip_out)

 notebooks folder created: /content/replication_project/notebooks
 Generated .gitignore: /content/replication_project/.gitignore
 Packaged: /content/replication_project.zip
