In [2]:
import pandas as pd

In [3]:
lyme = pd.read_csv("../data/raw_data/lyme_fig-1.csv")
avg_temp = pd.read_csv("../data/raw_data/avg_temp_june_1990-2022.csv")
 # max_temp = pd.read_csv("../data/raw_data/max_temp_june_1990-2022.csv")
min_temp = pd.read_csv("../data/raw_data/min_temp_june_1990-2022.csv")
precipitation = pd.read_csv("../data/raw_data/precipitation_june_1990-2022.csv")
cooling_degree = pd.read_csv("../data/raw_data/cooling_degree_days_june_1990-2022.csv")
pop = pd.read_csv("../data/raw_data/usa_pop_1991-2022_2042.csv")

In [4]:
# AVG TEMP cleaning

# remove the last two digits from the "Date" column ; rename it to "Year"
avg_temp['Year'] = avg_temp['Date'].astype(str).str[:-2]

# rename columns
avg_temp = avg_temp.rename(columns={
    'Value': 'Avg_temp_june_value',
    'Anomaly': 'Avg_temp_june_anomaly'
})

# drop 'Date' column
avg_temp = avg_temp.drop(columns=['Date'])

# reorder 
avg_temp = avg_temp[['Year', 'Avg_temp_june_value', 'Avg_temp_june_anomaly']]

print(avg_temp.head())


   Year  Avg_temp_june_value  Avg_temp_june_anomaly
0  1990                70.21                   1.74
1  1991                69.04                   0.57
2  1992                67.41                  -1.06
3  1993                67.19                  -1.28
4  1994                70.93                   2.46


In [5]:
# MIN TEMP cleaning

# remove the last two digits from the "Date" column ; rename it to "Year"
min_temp['Year'] = min_temp['Date'].astype(str).str[:-2]

# rename columns
min_temp = min_temp.rename(columns={
    'Value': 'Min_temp_june_value',
    'Anomaly': 'Min_temp_june_anomaly'
})

# drop 'Date' column
min_temp = min_temp.drop(columns=['Date'])

# reorder 
min_temp = min_temp[['Year', 'Min_temp_june_value', 'Min_temp_june_anomaly']]

print(min_temp.head())

   Year  Min_temp_june_value  Min_temp_june_anomaly
0  1990                57.20                   1.59
1  1991                56.82                   1.21
2  1992                55.06                  -0.55
3  1993                55.13                  -0.48
4  1994                57.96                   2.35


In [6]:
# PRECIPITATION cleaning

# remove the first two digits from the "Date" column ; rename it to "Year"
precipitation['Year'] = precipitation['Date'].astype(str).str[:-2]

# rename columns
precipitation = precipitation.rename(columns={
    'Value': 'Precipitation_june_value',
    'Anomaly': 'Precipitation_june_anomaly'
})

# drop 'Date' column
precipitation = precipitation.drop(columns=['Date'])

# reorder 
precipitation = precipitation[['Year', 'Precipitation_june_value', 'Precipitation_june_anomaly']]

print(precipitation.head())

   Year  Precipitation_june_value  Precipitation_june_anomaly
0  1990                      2.50                       -0.42
1  1991                      2.84                       -0.08
2  1992                      3.39                        0.47
3  1993                      3.61                        0.69
4  1994                      2.90                       -0.02


In [7]:
# remove the first two digits from the "Date" column ; rename it to "Year"
cooling_degree['Year'] = cooling_degree['Date'].astype(str).str[:-2]

# rename columns
cooling_degree = cooling_degree.rename(columns={
    'Value': 'Avg_cooling_degree_days_june',
    'Anomaly': 'Avg_cooling_degree_days_june_anomaly'
})

# drop 'Date' column
cooling_degree = cooling_degree.drop(columns=['Date'])

# reorder 
cooling_degree = cooling_degree[['Year', 'Avg_cooling_degree_days_june', 'Avg_cooling_degree_days_june_anomaly']]

print(cooling_degree.head())

   Year  Avg_cooling_degree_days_june  Avg_cooling_degree_days_june_anomaly
0  1990                           237                                    26
1  1991                           232                                    21
2  1992                           175                                   -36
3  1993                           208                                    -3
4  1994                           264                                    53


In [8]:
# POPULATION cleaning 
pop = pd.DataFrame(pop)

# Melt the data: Columns -> Rows
pop_melted = pop.melt(id_vars=["Series Name", "Series Code"], 
                    var_name="Year", 
                    value_name="Population")

# clean the "Year" column by removing anything in square brackets
pop_melted["Year"] = pop_melted["Year"].str.extract(r"(\d{4})")

# keep only the cleaned "Year" and "Population" columns
pop_cleaned = pop_melted[["Year", "Population"]]

print(pop_cleaned)

    Year  Population
0   1991   252981000
1   1992   256514000
2   1993   259919000
3   1994   263126000
4   1995   266278000
5   1996   269394000
6   1997   272657000
7   1998   275854000
8   1999   279040000
9   2000   282162411
10  2001   284968955
11  2002   287625193
12  2003   290107933
13  2004   292805298
14  2005   295516599
15  2006   298379912
16  2007   301231207
17  2008   304093966
18  2009   306771529
19  2010   309327143
20  2011   311583481
21  2012   313877662
22  2013   316059947
23  2014   318386329
24  2015   320738994
25  2016   323071755
26  2017   325122128
27  2018   326838199
28  2019   328329953
29  2020   331526933
30  2021   332048977
31  2022   333271411
32  2042   361189504


In [9]:
print(lyme.head())

   Year  Incidence_per_10000
0  1991             3.743365
1  1992             3.862554
2  1993             3.176764
3  1994             4.956944
4  1995             4.393898


In [10]:
from functools import reduce

# Convert the 'Year' column in all DataFrames to numeric (integer)
for df in [lyme, avg_temp, min_temp, precipitation, cooling_degree, pop_cleaned]:
    df['Year'] = pd.to_numeric(df['Year'], errors='coerce')  # Convert to numeric, coercing errors if needed

# Merge all DataFrames on the 'Year' column
dataframes = [lyme, avg_temp, min_temp, precipitation, cooling_degree, pop_cleaned]
merged_df = reduce(lambda left, right: left.merge(right, on='Year', how='inner'), dataframes)

# Display the merged DataFrame
print(merged_df.head())

   Year  Incidence_per_10000  Avg_temp_june_value  Avg_temp_june_anomaly  \
0  1991             3.743365                69.04                   0.57   
1  1992             3.862554                67.41                  -1.06   
2  1993             3.176764                67.19                  -1.28   
3  1994             4.956944                70.93                   2.46   
4  1995             4.393898                67.53                  -0.94   

   Min_temp_june_value  Min_temp_june_anomaly  Precipitation_june_value  \
0                56.82                   1.21                      2.84   
1                55.06                  -0.55                      3.39   
2                55.13                  -0.48                      3.61   
3                57.96                   2.35                      2.90   
4                55.24                  -0.37                      3.22   

   Precipitation_june_anomaly  Avg_cooling_degree_days_june  \
0                       -0.08

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Year'] = pd.to_numeric(df['Year'], errors='coerce')  # Convert to numeric, coercing errors if needed


In [11]:
# creating NEW column of TOTAL LYME DISEASE COUNTS per year 

merged_df['total_lyme_disease_counts'] = (merged_df['Incidence_per_10000'] / 10000) * merged_df['Population']

print(merged_df.head())


   Year  Incidence_per_10000  Avg_temp_june_value  Avg_temp_june_anomaly  \
0  1991             3.743365                69.04                   0.57   
1  1992             3.862554                67.41                  -1.06   
2  1993             3.176764                67.19                  -1.28   
3  1994             4.956944                70.93                   2.46   
4  1995             4.393898                67.53                  -0.94   

   Min_temp_june_value  Min_temp_june_anomaly  Precipitation_june_value  \
0                56.82                   1.21                      2.84   
1                55.06                  -0.55                      3.39   
2                55.13                  -0.48                      3.61   
3                57.96                   2.35                      2.90   
4                55.24                  -0.37                      3.22   

   Precipitation_june_anomaly  Avg_cooling_degree_days_june  \
0                       -0.08

In [12]:
merged_df.to_csv("../data/clean_data/merged_df.csv", index=False)