In [36]:
import pandas as pd

def create_region_mapping():
    """Create dictionary mapping states to their regions"""
    return {
        # [Your existing region mapping dictionary stays the same]
        # Northeast/Eastern Region
        'Maine': 'eastern',
        'New Hampshire': 'eastern',
        'Vermont': 'eastern',
        'Massachusetts': 'eastern',
        'Rhode Island': 'eastern',
        'Connecticut': 'eastern',
        'New York': 'eastern',
        'Pennsylvania': 'eastern',
        'New Jersey': 'eastern',
        'Delaware': 'eastern',
        'Maryland': 'eastern',
        'District of Columbia': 'eastern',
        
        # Southern Region
        'Virginia': 'southern',
        'West Virginia': 'southern',
        'Kentucky': 'southern',
        'Tennessee': 'southern',
        'North Carolina': 'southern',
        'South Carolina': 'southern',
        'Georgia': 'southern',
        'Florida': 'southern',
        'Alabama': 'southern',
        'Mississippi': 'southern',
        'Louisiana': 'southern',
        'Arkansas': 'southern',
        
        # Central Region
        'Ohio': 'central',
        'Indiana': 'central',
        'Illinois': 'central',
        'Michigan': 'central',
        'Wisconsin': 'central',
        'Minnesota': 'central',
        'Iowa': 'central',
        'Missouri': 'central',
        'North Dakota': 'central',
        'South Dakota': 'central',
        'Nebraska': 'central',
        'Kansas': 'central',
        
        # Western Region
        'Montana': 'western',
        'Idaho': 'western',
        'Wyoming': 'western',
        'Colorado': 'western',
        'New Mexico': 'western',
        'Arizona': 'western',
        'Utah': 'western',
        'Nevada': 'western',
        'California': 'western',
        'Oregon': 'western',
        'Washington': 'western',
        'Alaska': 'western',
        'Hawaii': 'western'
    }

def clean_lyme_data(df):
    # Create copy of dataframe
    df_clean = df.copy()
    
    # Clean state names by removing unusual characters
    df_clean['State'] = df_clean['State'].str.replace('[^a-zA-Z\s]', '', regex=True)
    
    # Melt the dataframe to convert years to rows
    df_melted = pd.melt(
        df_clean,
        id_vars=['State'],
        var_name='Year',
        value_name='Lyme_cases'
    )
    
    # Create region mapping
    region_mapping = create_region_mapping()
    
    # Add region column
    df_melted['region'] = df_melted['State'].map(region_mapping)
    
    # Convert Year to integer
    df_melted['Year'] = pd.to_numeric(df_melted['Year'])
    
    # Sort by region, state, and year
    df_melted = df_melted.sort_values(['region', 'State', 'Year'])

    df_melted['Lyme_cases'] = df_melted['Lyme_cases'].str.replace(',', '').astype(int)
    
    return df_melted

# Initialize df as None before the loop
df = None

# Try different encodings until one works
encodings_to_try = ['latin1', 'cp1252', 'iso-8859-1', 'utf-8']
for encoding in encodings_to_try:
    try:
        print(f"Trying {encoding} encoding...")
        df = pd.read_csv('../data/raw_data/lyme_states_2008-2022_WIDE.csv', encoding=encoding)
        print(f"Successfully read file with {encoding} encoding")
        break
    except UnicodeDecodeError:
        print(f"Failed with {encoding} encoding")
        continue
    except Exception as e:
        print(f"Different error with {encoding} encoding: {str(e)}")
        continue

# Check if we successfully loaded the data
if df is None:
    raise Exception("Could not read the CSV file with any of the attempted encodings")

# Clean the data
state_lyme = clean_lyme_data(df)

# Display first few rows and basic information
print("\nFirst few rows of the cleaned dataset:")
print(state_lyme.head())

# Print unique state names to verify cleaning worked
print("\nUnique state names after cleaning:")
print(sorted(state_lyme['State'].unique()))

print("\nSummary of data by region:")
print(state_lyme.groupby('region')['State'].nunique().to_frame('Number of States'))

  df_clean['State'] = df_clean['State'].str.replace('[^a-zA-Z\s]', '', regex=True)


Trying latin1 encoding...
Successfully read file with latin1 encoding

First few rows of the cleaned dataset:
        State  Year  Lyme_cases   region
13   Illinois  2008         108  central
65   Illinois  2009         136  central
117  Illinois  2010         135  central
169  Illinois  2011         194  central
221  Illinois  2012         204  central

Unique state names after cleaning:
['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'District of Columbia', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'US Total', 'Utah', 'Vermont', 'Virginia', 'Was

In [37]:
import pandas as pd
import glob
import os

def clean_avgtemp_data():
    
    # List to store dataframes
    avg_dfs = []
    
    # find all avgtemp data for each region
    avgtemp_files = glob.glob("../data/raw_data/*avgtemp.csv")  
    for file in avgtemp_files:
        print(file)
    
    # Process each avgtemp file for the regions
    for file_path in avgtemp_files:
        print(f"\nProcessing {file_path}")
        # Extract region from filename
        region = os.path.basename(file_path).split('_')[0]
        print(f"Extracted region: {region}")
        
        try:
            # Read CSV file, skipping the first 4 rows
            df = pd.read_csv(file_path, skiprows=4)
            print(f"Read {len(df)} rows from file")
            
            # Clean and rename columns
            df = df[['Date', 'Value']]  # Keep only needed columns
            df = df.rename(columns={
                'Date': 'Year',
                'Value': 'Avg_temp'
            })
            
            # Extract year from the date column
            df['Year'] = df['Year'].astype(str).str[:4].astype(int)
            
            # Add region column
            df['region'] = region
            
            avg_dfs.append(df)
            print(f"Successfully processed {file_path}")
            
        except Exception as e:
            print(f"Error processing {file_path}: {str(e)}")
    
    # Combine all regions
    if avg_dfs:
        return pd.concat(avg_dfs, ignore_index=True)
    return None

def clean_mintemp_data():
    # List to store dataframes
    min_dfs = []
    
    # Show which files we're finding
    print("\nFound these mintemp files:")
    mintemp_files = glob.glob("../data/raw_data/*mintemp.csv")  # Removed underscore from pattern
    for file in mintemp_files:
        print(file)
    
    # Process each mintemp file for the regions
    for file_path in mintemp_files:
        print(f"\nProcessing {file_path}")
        # Extract region from filename
        region = os.path.basename(file_path).split('_')[0]
        print(f"Extracted region: {region}")
        
        try:
            # Read CSV file, skipping the first 4 rows
            df = pd.read_csv(file_path, skiprows=4)
            print(f"Read {len(df)} rows from file")
            
            # Clean and rename columns
            df = df[['Date', 'Value']]  # Keep only needed columns
            df = df.rename(columns={
                'Date': 'Year',
                'Value': 'Min_temp_avg'
            })
            
            # Extract year from the date column
            df['Year'] = df['Year'].astype(str).str[:4].astype(int)
            
            # Add region column
            df['region'] = region
            
            min_dfs.append(df)
            print(f"Successfully processed {file_path}")
            
        except Exception as e:
            print(f"Error processing {file_path}: {str(e)}")
    
    # Combine all regions
    if min_dfs:
        return pd.concat(min_dfs, ignore_index=True)
    return None


# PRECIPITATION DATA
def clean_precipitation_data():
    # List to store dataframes
    min_dfs = []
    
    # Show which files we're finding
    precipitation_files = glob.glob("../data/raw_data/*precipitation.csv")  # Removed underscore from pattern
    for file in precipitation_files:
        print(file)
    
    # Process each mintemp file for the regions
    for file_path in precipitation_files:
        print(f"\nProcessing {file_path}")
        # Extract region from filename
        region = os.path.basename(file_path).split('_')[0]
        print(f"Extracted region: {region}")
        
        try:
            # Read CSV file, skipping the first 4 rows
            df = pd.read_csv(file_path, skiprows=4)
            print(f"Read {len(df)} rows from file")
            
            # Clean and rename columns
            df = df[['Date', 'Value']]  # Keep only needed columns
            df = df.rename(columns={
                'Date': 'Year',
                'Value': 'Precipitation_avg'
            })
            
            # Extract year from the date column
            df['Year'] = df['Year'].astype(str).str[:4].astype(int)
            
            # Add region column
            df['region'] = region
            
            min_dfs.append(df)
            print(f"Successfully processed {file_path}")
            
        except Exception as e:
            print(f"Error processing {file_path}: {str(e)}")
    
    # Combine all regions
    if min_dfs:
        return pd.concat(min_dfs, ignore_index=True)
    return None

# Process both types of data
print("Processing average temperature data...")
avg_temp_data = clean_avgtemp_data()
print("\nProcessing minimum temperature data...")
min_temp_data = clean_mintemp_data()
print("\nProcessing processing data...")
precipitation_data = clean_precipitation_data()




# # Merge average and minimum temperature data
# if avg_temp_data is not None and min_temp_data is not None:
#     final_data = min_temp_data.merge(avg_temp_data, on=['year', 'region'], how='left')
    
#     # Sort by region and year
#     final_data = final_data.sort_values(['region', 'year'])
    
#     # Save the merged dataset
#     final_data.to_csv("merged_temperature_data.csv", index=False)
    
#     print("\nData processing completed successfully!")
#     print("\nFirst few rows of the merged dataset:")
#     print(final_data.head())
# else:
#     print("Error: No data was processed. Please check if the input files exist.")

Processing average temperature data...


../data/raw_data/western_avgtemp.csv
../data/raw_data/central_avgtemp.csv
../data/raw_data/eastern_avgtemp.csv
../data/raw_data/southern_avgtemp.csv

Processing ../data/raw_data/western_avgtemp.csv
Extracted region: western
Read 21 rows from file
Successfully processed ../data/raw_data/western_avgtemp.csv

Processing ../data/raw_data/central_avgtemp.csv
Extracted region: central
Read 21 rows from file
Successfully processed ../data/raw_data/central_avgtemp.csv

Processing ../data/raw_data/eastern_avgtemp.csv
Extracted region: eastern
Read 21 rows from file
Successfully processed ../data/raw_data/eastern_avgtemp.csv

Processing ../data/raw_data/southern_avgtemp.csv
Extracted region: southern
Read 21 rows from file
Successfully processed ../data/raw_data/southern_avgtemp.csv

Processing minimum temperature data...

Found these mintemp files:
../data/raw_data/western_mintemp.csv
../data/raw_data/southern_mintemp.csv
../data/raw_data/eastern_mintemp.csv
../data/raw_data/central_mintemp.cs

In [38]:
merged = min_temp_data.merge(avg_temp_data, on=['Year', 'region'], how='left')
merged2 = precipitation_data.merge(merged, on=['Year', 'region'], how='outer')
merged3 = state_lyme.merge(merged2, on=['Year', 'region'], how='right')
print(merged3.head())

      State  Year  Lyme_cases   region  Precipitation_avg  Min_temp_avg  \
0  Illinois  2008         108  central               4.49          53.8   
1   Indiana  2008          42  central               4.49          53.8   
2      Iowa  2008         109  central               4.49          53.8   
3    Kansas  2008          16  central               4.49          53.8   
4  Michigan  2008          92  central               4.49          53.8   

   Avg_temp  
0      66.1  
1      66.1  
2      66.1  
3      66.1  
4      66.1  


In [39]:
merged3.to_csv('../data/clean_data/state_data.csv', index=False)