In [4]:
# %%
import pandas as pd

# %%
data = pd.read_csv("D:\\OneDrive\\Desktop\\Work\\python\\RBCDS\\Challenge\\Second-half\\Project_4\\Weekly_Provisional_Counts_of_Deaths_by_State_and_Select_Causes__2020-2023_20240410.csv")


# %%
print(data.dtypes)


# %%
data["Week Ending Date"] = pd.to_datetime(data["Week Ending Date"])


# %%
data['Year'] = data['Week Ending Date'].dt.year
data['Month'] = data['Week Ending Date'].dt.month
data['Day'] = data['Week Ending Date'].dt.day



# %%
data_agg = data.groupby(['Year', 'Month', 'Jurisdiction of Occurrence'])['All Cause'].sum().reset_index()


# %%
# Check for missing values
print(data_agg.isnull().sum())

# Handle missing values as needed (e.g., fill missing values or remove rows with missing values)


# %%
# Encode categorical variables using one-hot encoding
data_agg = pd.get_dummies(data_agg, columns=['Jurisdiction of Occurrence'])


# %%
# Scale features if necessary
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
data_agg[['Year', 'Month', 'All Cause']] = scaler.fit_transform(data_agg[['Year', 'Month', 'All Cause']])


# %%
from sklearn.model_selection import train_test_split

# Split the data into features (X) and target (y)
X = data_agg.drop('All Cause', axis=1)
y = data_agg['All Cause']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# %%
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

# Create a decision tree regression model
dt_regressor = DecisionTreeRegressor(random_state=42)

# Train the model on the training set
dt_regressor.fit(X_train, y_train)

# Evaluate the model's performance on the testing set
y_pred = dt_regressor.predict(X_test)

# Calculate evaluation metrics
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the evaluation metrics
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', rmse)
print('Mean Absolute Error:', mae)
print('R-squared:', r2)


# %%
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor

# Define the hyperparameter grid
param_grid = {
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

# Create a decision tree regression model
dt_regressor = DecisionTreeRegressor(random_state=42)

# Perform grid search
grid_search = GridSearchCV(dt_regressor, param_grid, cv=5, scoring='r2', return_train_score=True, verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best hyperparameters and the corresponding mean cross-validated score
print('Best hyperparameters:', grid_search.best_params_)
print('Best mean cross-validated score:', grid_search.best_score_)


# %%
# Create a decision tree regression model with the best hyperparameters
dt_regressor_best = DecisionTreeRegressor(max_depth=None, max_features='sqrt', min_samples_leaf=2, min_samples_split=2, random_state=42)

# Train the model on the training data
dt_regressor_best.fit(X_train, y_train)

# Evaluate the model on the testing data
y_pred = dt_regressor_best.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the evaluation metrics
print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', rmse)
print('Mean Absolute Error:', mae)
print('R-squared:', r2)

# %%
data

# %%
filtered_data = data[data["Jurisdiction of Occurrence"] == "Arkansas"]
filtered_data

# %%
# filtered_data = data_agg[data_agg["Jurisdiction of Occurrence_Alabama"] == 1]
# filtered_data

# %%
import pandas as pd

# Assuming the date column is named 'Date' and is in the format 'MM/DD/YYYY'
data['Date'] = pd.to_datetime(data['Data As Of'], format='%m/%d/%Y')

# Extract year, month, and day as separate columns
data['Year'] = data['Date'].dt.year
data['Month'] = data['Date'].dt.month
data['Day'] = data['Date'].dt.day

# Drop the original 'Date' column if it's no longer needed
data = data.drop('Date', axis=1)


# %%
filtered_data = filtered_data.drop('Data As Of', axis=1)
filtered_data = filtered_data.drop('Jurisdiction of Occurrence', axis=1)
filtered_data = filtered_data.drop('Week Ending Date', axis=1)
filtered_data.drop(filtered_data.columns[20:36], axis=1, inplace=True)



# %%
filtered_data = filtered_data.drop('flag_allcause', axis=1)
filtered_data = filtered_data.drop('flag_natcause', axis=1)
filtered_data = filtered_data.drop('flag_sept', axis=1)

# %%
filtered_data.to_csv("filtered_data.csv", index=False)

# %%
filtered_data

# %%
X = filtered_data.drop('Diabetes mellitus (E10-E14)', axis=1)
y = filtered_data['Diabetes mellitus (E10-E14)']

X = X.fillna(0)
y = y.fillna(0)

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



# %%
from sklearn.tree import DecisionTreeRegressor

dt_regressor = DecisionTreeRegressor(random_state=42)
dt_regressor.fit(X_train, y_train)


# %%
y_pred = dt_regressor.predict(X_test)

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import numpy as np

mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print('Mean Squared Error:', mse)
print('Root Mean Squared Error:', rmse)
print('Mean Absolute Error:', mae)
print('R-squared:', r2)


# %%
# Predict the target values for the test set (X_test)
y_pred = dt_regressor.predict(X_test)

# Print the predicted values
print("Predicted values:\n", y_pred)


# %%
import pandas as pd

# Create a new DataFrame with the actual and predicted values
predictions_df = pd.DataFrame({
    'Actual': y_test,
    'Predicted': y_pred
})

# Print the first 10 rows of the predictions DataFrame
print("First 10 rows of predictions:\n", predictions_df.head(10))


# %%
predictions_df

# %%
#for loop, for each state, all diseases




Data As Of                                                                                            object
Jurisdiction of Occurrence                                                                            object
MMWR Year                                                                                              int64
MMWR Week                                                                                              int64
Week Ending Date                                                                                      object
All Cause                                                                                              int64
Natural Cause                                                                                          int64
Septicemia (A40-A41)                                                                                 float64
Malignant neoplasms (C00-C97)                                                                        float64
Diabetes mellitus (

Unnamed: 0,Actual,Predicted
914,15.0,21.0
792,16.0,24.0
931,27.0,31.0
872,25.0,28.0
844,23.0,23.0
929,26.0,31.0
831,31.0,33.0
791,11.0,21.0
888,27.0,36.0
887,32.0,37.0


In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Initialize an empty list to store the predictions DataFrames
predictions_dfs = []

# Get the unique jurisdictions
jurisdictions = data['Jurisdiction of Occurrence'].unique()

# Get the columns of interest
columns_of_interest = ['Septicemia (A40-A41)','Malignant neoplasms (C00-C97)','Diabetes mellitus (E10-E14)','Alzheimer disease (G30)','Influenza and pneumonia (J09-J18)', 'Chronic lower respiratory diseases (J40-J47)',
                       "Other diseases of respiratory system (J00-J06,J30-J39,J67,J70-J98)","Nephritis, nephrotic syndrome and nephrosis (N00-N07,N17-N19,N25-N27)","Symptoms, signs and abnormal clinical and laboratory findings, not elsewhere classified (R00-R99)",
                       "Diseases of heart (I00-I09,I11,I13,I20-I51)",'Cerebrovascular diseases (I60-I69)',"COVID-19 (U071, Multiple Cause of Death)","COVID-19 (U071, Underlying Cause of Death)" ]

# Loop through the jurisdictions
for jurisdiction in jurisdictions:
    # Filter the data
    filtered_data = data[data['Jurisdiction of Occurrence'] == jurisdiction]

    # Loop through the columns of interest
    for column in columns_of_interest:
        # Preprocessing steps
        # Convert the 'Data As Of' column to datetime objects
        filtered_data['Data As Of'] = pd.to_datetime(filtered_data['Data As Of'], format='%m/%d/%Y')


        filtered_data = filtered_data.drop('Jurisdiction of Occurrence', axis=1)
        filtered_data = filtered_data.drop('Week Ending Date', axis=1)
        filtered_data.drop(filtered_data.columns[20:36], axis=1, inplace=True)
        filtered_data = filtered_data.drop('flag_allcause', axis=1)
        filtered_data = filtered_data.drop('flag_natcause', axis=1)
       
       

# Extract year, month, and day as separate columns
        filtered_data['Year'] = filtered_data['Data As Of'].dt.year
        filtered_data['Month'] = filtered_data['Data As Of'].dt.month
        filtered_data['Day'] = filtered_data['Data As Of'].dt.day

# Drop the original 'Date' column if it's no longer needed
        filtered_data = filtered_data.drop('Data As Of', axis=1)

        # Fill missing values
        filtered_data = filtered_data.fillna(0)

        # Split the data into features (X) and target (y)
        X = filtered_data.drop(column, axis=1)
        y = filtered_data[column]

        # Split the data into training and testing sets
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

        # Train the model
        dt_regressor = DecisionTreeRegressor(random_state=42)
        dt_regressor.fit(X_train, y_train)

        # Make predictions on the test set
        y_pred = dt_regressor.predict(X_test)

        # Calculate evaluation metrics
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        # Create a DataFrame with the actual and predicted values
        predictions_df = pd.DataFrame({
            'Actual': y_test,
            'Predicted': y_pred,
            'MSE': mse,
            'RMSE': rmse,
            'MAE': mae,
            'R2': r2
        })

        # Add the jurisdiction name and column name to the DataFrame
        predictions_df['Jurisdiction'] = jurisdiction
        predictions_df['Column'] = column

        # Append the DataFrame to the list
        predictions_dfs.append(predictions_df)
        filtered_data = data[data['Jurisdiction of Occurrence'] == jurisdiction].reset_index(drop=True)


# Concatenate the predictions DataFrames into a single DataFrame
combined_predictions_df = pd.concat(predictions_dfs, ignore_index=True)

# Save the DataFrame to a CSV file
combined_predictions_df.to_csv("combined_predictions.csv", index=False)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['Data As Of'] = pd.to_datetime(filtered_data['Data As Of'], format='%m/%d/%Y')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['Data As Of'] = pd.to_datetime(filtered_data['Data As Of'], format='%m/%d/%Y')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_data['Data