In [1]:
import pandas as pd

def create_region_mapping():
    """Create dictionary mapping states to their regions"""
    return {
        # [Your existing region mapping dictionary stays the same]
        # Northeast/Eastern Region
        'Maine': 'eastern',
        'New Hampshire': 'eastern',
        'Vermont': 'eastern',
        'Massachusetts': 'eastern',
        'Rhode Island': 'eastern',
        'Connecticut': 'eastern',
        'New York': 'eastern',
        'Pennsylvania': 'eastern',
        'New Jersey': 'eastern',
        'Delaware': 'eastern',
        'Maryland': 'eastern',
        'District of Columbia': 'eastern',
        
        # Southern Region
        'Virginia': 'southern',
        'West Virginia': 'southern',
        'Kentucky': 'southern',
        'Tennessee': 'southern',
        'North Carolina': 'southern',
        'South Carolina': 'southern',
        'Georgia': 'southern',
        'Florida': 'southern',
        'Alabama': 'southern',
        'Mississippi': 'southern',
        'Louisiana': 'southern',
        'Arkansas': 'southern',
        
        # Central Region
        'Ohio': 'central',
        'Indiana': 'central',
        'Illinois': 'central',
        'Michigan': 'central',
        'Wisconsin': 'central',
        'Minnesota': 'central',
        'Iowa': 'central',
        'Missouri': 'central',
        'North Dakota': 'central',
        'South Dakota': 'central',
        'Nebraska': 'central',
        'Kansas': 'central',
        
        # Western Region
        'Montana': 'western',
        'Idaho': 'western',
        'Wyoming': 'western',
        'Colorado': 'western',
        'New Mexico': 'western',
        'Arizona': 'western',
        'Utah': 'western',
        'Nevada': 'western',
        'California': 'western',
        'Oregon': 'western',
        'Washington': 'western',
        'Alaska': 'western',
        'Hawaii': 'western'
    }

def clean_lyme_data(df):
    # Create copy of dataframe
    df_clean = df.copy()
    
    # Clean state names by removing unusual characters
    df_clean['State'] = df_clean['State'].str.replace('[^a-zA-Z\s]', '', regex=True)
    
    # Melt the dataframe to convert years to rows
    df_melted = pd.melt(
        df_clean,
        id_vars=['State'],
        var_name='Year',
        value_name='Lyme_cases'
    )
    
    # Create region mapping
    region_mapping = create_region_mapping()
    
    # Add region column
    df_melted['region'] = df_melted['State'].map(region_mapping)
    
    # Convert Year to integer
    df_melted['Year'] = pd.to_numeric(df_melted['Year'])
    
    # Sort by region, state, and year
    df_melted = df_melted.sort_values(['region', 'State', 'Year'])

    df_melted['Lyme_cases'] = df_melted['Lyme_cases'].str.replace(',', '').astype(int)
    
    return df_melted

# Initialize df as None before the loop
df = None

# Try different encodings until one works
encodings_to_try = ['latin1', 'cp1252', 'iso-8859-1', 'utf-8']
for encoding in encodings_to_try:
    try:
        print(f"Trying {encoding} encoding...")
        df = pd.read_csv('../data/raw_data/lyme_states_2008-2022_WIDE.csv', encoding=encoding)
        print(f"Successfully read file with {encoding} encoding")
        break
    except UnicodeDecodeError:
        print(f"Failed with {encoding} encoding")
        continue
    except Exception as e:
        print(f"Different error with {encoding} encoding: {str(e)}")
        continue

# Check if we successfully loaded the data
if df is None:
    raise Exception("Could not read the CSV file with any of the attempted encodings")

# Clean the data
state_lyme = clean_lyme_data(df)

# Display first few rows and basic information
print("\nFirst few rows of the cleaned dataset:")
print(state_lyme.head())

# Print unique state names to verify cleaning worked
print("\nUnique state names after cleaning:")
print(sorted(state_lyme['State'].unique()))

print("\nSummary of data by region:")
print(state_lyme.groupby('region')['State'].nunique().to_frame('Number of States'))

  df_clean['State'] = df_clean['State'].str.replace('[^a-zA-Z\s]', '', regex=True)


Trying latin1 encoding...
Successfully read file with latin1 encoding

First few rows of the cleaned dataset:
        State  Year  Lyme_cases   region
13   Illinois  2008         108  central
65   Illinois  2009         136  central
117  Illinois  2010         135  central
169  Illinois  2011         194  central
221  Illinois  2012         204  central

Unique state names after cleaning:
['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 'District of Columbia', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky', 'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana', 'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota', 'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Rhode Island', 'South Carolina', 'South Dakota', 'Tennessee', 'Texas', 'US Total', 'Utah', 'Vermont', 'Virginia', 'Was

## Climate data

In [2]:
import pandas as pd
import glob
import os

def clean_avgtemp_data():
    
    # List to store dataframes
    avg_dfs = []
    
    # find all avgtemp data for each region
    avgtemp_files = glob.glob("../data/raw_data/*avgtemp.csv")  
    for file in avgtemp_files:
        print(file)
    
    # Process each avgtemp file for the regions
    for file_path in avgtemp_files:
        print(f"\nProcessing {file_path}")
        # Extract region from filename
        region = os.path.basename(file_path).split('_')[0]
        print(f"Extracted region: {region}")
        
        try:
            # Read CSV file, skipping the first 4 rows
            df = pd.read_csv(file_path, skiprows=4)
            print(f"Read {len(df)} rows from file")
            
            # Clean and rename columns
            df = df[['Date', 'Value']]  # Keep only needed columns
            df = df.rename(columns={
                'Date': 'Year',
                'Value': 'Avg_temp'
            })
            
            # Extract year from the date column
            df['Year'] = df['Year'].astype(str).str[:4].astype(int)
            
            # Add region column
            df['region'] = region
            
            avg_dfs.append(df)
            print(f"Successfully processed {file_path}")
            
        except Exception as e:
            print(f"Error processing {file_path}: {str(e)}")
    
    # Combine all regions
    if avg_dfs:
        return pd.concat(avg_dfs, ignore_index=True)
    return None

def clean_mintemp_data():
    # List to store dataframes
    min_dfs = []
    
    # Show which files we're finding
    print("\nFound these mintemp files:")
    mintemp_files = glob.glob("../data/raw_data/*mintemp.csv")  # Removed underscore from pattern
    for file in mintemp_files:
        print(file)
    
    # Process each mintemp file for the regions
    for file_path in mintemp_files:
        print(f"\nProcessing {file_path}")
        # Extract region from filename
        region = os.path.basename(file_path).split('_')[0]
        print(f"Extracted region: {region}")
        
        try:
            # Read CSV file, skipping the first 4 rows
            df = pd.read_csv(file_path, skiprows=4)
            print(f"Read {len(df)} rows from file")
            
            # Clean and rename columns
            df = df[['Date', 'Value']]  # Keep only needed columns
            df = df.rename(columns={
                'Date': 'Year',
                'Value': 'Min_temp_avg'
            })
            
            # Extract year from the date column
            df['Year'] = df['Year'].astype(str).str[:4].astype(int)
            
            # Add region column
            df['region'] = region
            
            min_dfs.append(df)
            print(f"Successfully processed {file_path}")
            
        except Exception as e:
            print(f"Error processing {file_path}: {str(e)}")
    
    # Combine all regions
    if min_dfs:
        return pd.concat(min_dfs, ignore_index=True)
    return None


# PRECIPITATION DATA
def clean_precipitation_data():
    # List to store dataframes
    min_dfs = []
    
    # Show which files we're finding
    precipitation_files = glob.glob("../data/raw_data/*precipitation.csv")  # Removed underscore from pattern
    for file in precipitation_files:
        print(file)
    
    # Process each mintemp file for the regions
    for file_path in precipitation_files:
        print(f"\nProcessing {file_path}")
        # Extract region from filename
        region = os.path.basename(file_path).split('_')[0]
        print(f"Extracted region: {region}")
        
        try:
            # Read CSV file, skipping the first 4 rows
            df = pd.read_csv(file_path, skiprows=4)
            print(f"Read {len(df)} rows from file")
            
            # Clean and rename columns
            df = df[['Date', 'Value']]  # Keep only needed columns
            df = df.rename(columns={
                'Date': 'Year',
                'Value': 'Precipitation_avg'
            })
            
            # Extract year from the date column
            df['Year'] = df['Year'].astype(str).str[:4].astype(int)
            
            # Add region column
            df['region'] = region
            
            min_dfs.append(df)
            print(f"Successfully processed {file_path}")
            
        except Exception as e:
            print(f"Error processing {file_path}: {str(e)}")
    
    # Combine all regions
    if min_dfs:
        return pd.concat(min_dfs, ignore_index=True)
    return None

# Process both types of data
print("Processing average temperature data...")
avg_temp_data = clean_avgtemp_data()
print("\nProcessing minimum temperature data...")
min_temp_data = clean_mintemp_data()
print("\nProcessing processing data...")
precipitation_data = clean_precipitation_data()




# # Merge average and minimum temperature data
# if avg_temp_data is not None and min_temp_data is not None:
#     final_data = min_temp_data.merge(avg_temp_data, on=['year', 'region'], how='left')
    
#     # Sort by region and year
#     final_data = final_data.sort_values(['region', 'year'])
    
#     # Save the merged dataset
#     final_data.to_csv("merged_temperature_data.csv", index=False)
    
#     print("\nData processing completed successfully!")
#     print("\nFirst few rows of the merged dataset:")
#     print(final_data.head())
# else:
#     print("Error: No data was processed. Please check if the input files exist.")

Processing average temperature data...
../data/raw_data/western_avgtemp.csv
../data/raw_data/central_avgtemp.csv
../data/raw_data/eastern_avgtemp.csv
../data/raw_data/southern_avgtemp.csv

Processing ../data/raw_data/western_avgtemp.csv
Extracted region: western
Read 21 rows from file
Successfully processed ../data/raw_data/western_avgtemp.csv

Processing ../data/raw_data/central_avgtemp.csv
Extracted region: central
Read 21 rows from file
Successfully processed ../data/raw_data/central_avgtemp.csv

Processing ../data/raw_data/eastern_avgtemp.csv
Extracted region: eastern
Read 21 rows from file
Successfully processed ../data/raw_data/eastern_avgtemp.csv

Processing ../data/raw_data/southern_avgtemp.csv
Extracted region: southern
Read 21 rows from file
Successfully processed ../data/raw_data/southern_avgtemp.csv

Processing minimum temperature data...

Found these mintemp files:
../data/raw_data/western_mintemp.csv
../data/raw_data/southern_mintemp.csv
../data/raw_data/eastern_mintemp.c

## Tree coverage loss data

In [3]:
tcloss = pd.read_csv('../data/raw_data/treecoverlossdata.csv')

# 1. Filter for threshold = 75 and remove the threshold column
filtered_data = tcloss[tcloss['threshold'] == 75]

# 2. Remove unnecessary columns
columns_to_remove = ['country', 'extent_2000_ha', 'extent_2010_ha', 'gain_2000-2020_ha', 'threshold']
filtered_data = filtered_data.drop(columns=columns_to_remove)

# 3. Melt on tc_loss_ha_20** columns
melted_tc = pd.melt(
    filtered_data,
    id_vars=['subnational1', 'area_ha'],
    value_vars=[col for col in filtered_data.columns if col.startswith('tc_loss_ha_')],
    var_name='Year',
    value_name='Tree_Cover_Loss'
)

# 4. Clean the 'Year' column to retain only the numeric year
melted_tc['Year'] = melted_tc['Year'].str.extract(r'(\d{4})')
melted_tc['Year'] = pd.to_numeric(melted_tc['Year'])

# 5. Rename columns
melted_tc = melted_tc.rename(columns={'subnational1': 'State', 'area_ha': 'Total_Land_Area'})

# Display the cleaned data
print(melted_tc.head())  # Replace with tools to display if needed



        State  Total_Land_Area  Year  Tree_Cover_Loss
0     Alabama         13363464  2001           168587
1      Alaska        150737804  2001            27964
2     Arizona         29535713  2001              653
3    Arkansas         13769059  2001           110114
4  California         40961694  2001            39102


## Species richness data

In [13]:
species = pd.read_csv('../data/raw_data/species_richness_by_state.csv')
species = species.rename(columns={'state': 'State'})
species['state_park_land_coverage'] = species['state_park_land_coverage'].str.replace('%', '')
species['state_park_land_coverage'] = pd.to_numeric(species['state_park_land_coverage']) / 100
print(species.head())

        State  species_richness  state_park_land_coverage  state_park_rank
0     Alabama              10.0                    0.0023               46
1      Alaska               NaN                    0.0910                2
2     Arizona               5.0                    0.0260               12
3    Arkansas               8.0                    0.0018               44
4  California              10.0                    0.0749                3


### Merge all datasets

In [14]:
merged = min_temp_data.merge(avg_temp_data, on=['Year', 'region'], how='left')
merged2 = precipitation_data.merge(merged, on=['Year', 'region'], how='outer')
merged3 = merged2.merge(state_lyme, on=['Year', 'region'], how='left')
merged4 = merged3.merge(melted_tc, on=['Year', 'State'], how="left")
merged5 = merged4.merge(species, on='State', how='left')
print(merged5.head())

   Year  Precipitation_avg   region  Min_temp_avg  Avg_temp     State  \
0  2008               4.49  central          53.8      66.1  Illinois   
1  2008               4.49  central          53.8      66.1   Indiana   
2  2008               4.49  central          53.8      66.1      Iowa   
3  2008               4.49  central          53.8      66.1    Kansas   
4  2008               4.49  central          53.8      66.1  Michigan   

   Lyme_cases  Total_Land_Area  Tree_Cover_Loss  species_richness  \
0         108         15008781              615               6.0   
1          42          9436269             1051               8.0   
2         109         14584483              209               5.0   
3          16         21312413              460               1.0   
4          92         25036039            27919               3.0   

   state_park_land_coverage  state_park_rank  
0                    0.0139             19.0  
1                    0.0081             32.0  
2    

In [12]:
merged5.dtypes

Year                          int64
Precipitation_avg           float64
region                       object
Min_temp_avg                float64
Avg_temp                    float64
State                        object
Lyme_cases                    int64
Total_Land_Area               int64
Tree_Cover_Loss               int64
species_richness            float64
state_park_land_coverage    float64
state_park_rank             float64
dtype: object

In [34]:
merged5.to_csv('../data/clean_data/data.csv', index=False)

## Cleaning RCP 8.5 future climate scenarios data

In [None]:
# import pandas as pd
# import os

# def process_climate_file(file_path):
#     # Define mapping for filename to region and number of rows to skip
#     file_configs = {
#         'westernUS': {'region': 'western', 'skiprows': 13},
#         'mississippi-2050': {'region': 'southern', 'skiprows': 11},
#         'northernUS': {'region': 'northern', 'skiprows': 13},
#         'iowa': {'region': 'central', 'skiprows': 11}  # Fixed filename for iowa
#     }
    
#     # Extract filename base (handle both iowa and iowa_central cases)
#     file_name = os.path.basename(file_path).split('_')[0]
    
#     # Get configuration for this file
#     config = file_configs.get(file_name)
#     if not config:
#         print(f"Unknown file format: {file_name}")
#         return None
    
#     try:
#         # Read CSV file with appropriate number of rows to skip and tab separator
#         df = pd.read_csv(file_path, skiprows=config['skiprows'] - 1, sep='\t')
        
#         # Extract the specific values from Scenario 4 column
#         climate_data = {
#             'Indicator': ['Avg_temp', 'Min_temp', 'Precipitation_avg'],
#             'value': [
#                 float(df.iloc[1]['Scenario 4'].strip()),    # Summer Mean Temperature
#                 float(df.iloc[3]['Scenario 4'].strip()),    # Summer Minimum Temperature
#                 float(df.iloc[5]['Scenario 4'].strip())     # Summer Precipitation
#             ],
#             'region': [config['region']] * 3  # Use the region from config
#         }
        
#         # Create DataFrame from extracted data
#         result_df = pd.DataFrame(climate_data)
        
#         # Print debug information
#         print(f"\nProcessing {file_name}:")
#         print(f"Avg_temp: {climate_data['value'][0]}")
#         print(f"Min_temp: {climate_data['value'][1]}")
#         print(f"Precipitation_avg: {climate_data['value'][2]}")
        
#         return result_df
        
#     except Exception as e:
#         print(f"\nError processing file {file_path}:")
#         print(f"Error message: {str(e)}")
#         print(f"File: {file_name}")
#         if 'df' in locals():
#             print(f"DataFrame shape: {df.shape}")
#             print(f"DataFrame columns: {df.columns.tolist()}")
#             print(f"First few rows of data:")
#             print(df.head())
#         return None

# def process_all_climate_files(directory_path):
#     all_dfs = []
    
#     # Find all relevant CSV files
#     for file in os.listdir(directory_path):
#         if file.endswith('rcp8.5.csv'):
#             file_path = os.path.join(directory_path, file)
#             processed_df = process_climate_file(file_path)
#             if processed_df is not None:
#                 all_dfs.append(processed_df)
    
#     # Combine all DataFrames if we have any
#     if all_dfs:
#         final_df = pd.concat(all_dfs, ignore_index=True)
        
#         # Ensure numeric values in 'value' column
#         final_df['value'] = pd.to_numeric(final_df['value'], errors='coerce')
        
#         return final_df
#     else:
#         return pd.DataFrame()

# # Example usage:
# if __name__ == "__main__":
#     directory_path = "../data/raw_data"  # Your directory path
    
#     # Process all files
#     combined_climate_data = process_all_climate_files(directory_path)
    
#     if not combined_climate_data.empty:
#         print("\nFinal Combined Climate Data:")
#         print(combined_climate_data)
        
#         # Save to CSV
#         output_path = "processed_climate_data.csv"
#         combined_climate_data.to_csv(output_path, index=False)
#         print(f"\nData saved to {output_path}")
#     else:
#         print("\nNo data was successfully processed.")


Error processing file ../data/raw_data/iowa_central_rcp8.5.csv:
Error message: 'Scenario 4'
File: iowa
DataFrame shape: (9, 1)
DataFrame columns: ['********************************************************']
First few rows of data:
  ********************************************************
0  Climate Metric, Scenario 1, Scenario 2, Scenar...      
1  Summer Mean Temperature(°F),73.602249,77.45282...      
2  (change relative to historical by °F),1.06,4.9...      
3  Summer Minimum Temperature(°F),62.734554,65.90...      
4  (change relative to historical by °F),1.26,4.4...      

Error processing file ../data/raw_data/mississippi-2050_rcp8.5.csv:
Error message: 'Scenario 4'
File: mississippi-2050
DataFrame shape: (9, 1)
DataFrame columns: ['********************************************************']
First few rows of data:
  ********************************************************
0  Climate Metric, Scenario 1, Scenario 2, Scenar...      
1  Summer Mean Temperature(°F),84.072647,84.7415

In [37]:
rcp = pd.read_csv('../data/clean_data/RCP8.5_data.csv')

region_aggregates = merged5.groupby('region').agg({
    'Total_Land_Area': 'sum',  # Sum for total land area
    'state_park_land_coverage': 'mean',  # Average for park land coverage
    'state_park_rank': 'mean'  # Average for park rank
}).reset_index()

# Merge region-level climate data with the aggregated state-level data
# Assuming `climate_df` contains your regional climate data for future years
# Example: {'region': ['central', 'eastern', ...], 'Precipitation_avg': [value1, value2, ...]}
future_data = pd.merge(rcp, region_aggregates, on='region')

print(future_data)

future_data.to_csv('../data/clean_data/2050inputs.csv')

            Indicator  value    region  Total_Land_Area  \
0            Avg_temp  86.36  southern       2105379510   
1            Min_temp  75.29  southern       2105379510   
2   Precipitation_avg  12.58  southern       2105379510   
3            Avg_temp  72.39   eastern        710057100   
4            Min_temp  61.21   eastern        710057100   
5   Precipitation_avg  12.76   eastern        710057100   
6            Avg_temp  79.33   central       3198039870   
7            Min_temp  67.81   central       3198039870   
8   Precipitation_avg  12.41   central       3198039870   
9            Avg_temp  74.49   western       6895077150   
10           Min_temp  59.00   western       6895077150   
11  Precipitation_avg   7.50   western       6895077150   

    state_park_land_coverage  state_park_rank  
0                   0.012292        30.916667  
1                   0.012292        30.916667  
2                   0.012292        30.916667  
3                   0.027827        17.3