In [1]:
import pandas as pd
# Load the CSV File
data = pd.read_csv("data.csv")
data.head()

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,Datasource,Class,Topic,Question,Data_Value_Unit,Data_Value_Type,...,GeoLocation,ClassID,TopicID,QuestionID,DataValueTypeID,LocationID,StratificationCategory1,Stratification1,StratificationCategoryId1,StratificationID1
0,2011,2011,AL,Alabama,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(32.84057112200048, -86.63186076199969)",OWS,OWS1,Q036,VALUE,1,Total,Total,OVR,OVERALL
1,2011,2011,AL,Alabama,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(32.84057112200048, -86.63186076199969)",OWS,OWS1,Q036,VALUE,1,Gender,Male,GEN,MALE
2,2011,2011,AL,Alabama,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(32.84057112200048, -86.63186076199969)",OWS,OWS1,Q036,VALUE,1,Gender,Female,GEN,FEMALE
3,2011,2011,AL,Alabama,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(32.84057112200048, -86.63186076199969)",OWS,OWS1,Q036,VALUE,1,Education,Less than high school,EDU,EDUHS
4,2011,2011,AL,Alabama,Behavioral Risk Factor Surveillance System,Obesity / Weight Status,Obesity / Weight Status,Percent of adults aged 18 years and older who ...,,Value,...,"(32.84057112200048, -86.63186076199969)",OWS,OWS1,Q036,VALUE,1,Education,High school graduate,EDU,EDUHSGRAD


In [2]:
columns_to_keep = ['YearStart', 'Question', 'StratificationCategory1', 'Data_Value', 'Stratification1']

# Create a new DataFrame with only the selected columns
data = data[columns_to_keep]
data

Unnamed: 0,YearStart,Question,StratificationCategory1,Data_Value,Stratification1
0,2011,Percent of adults aged 18 years and older who ...,Total,32.0,Total
1,2011,Percent of adults aged 18 years and older who ...,Gender,32.3,Male
2,2011,Percent of adults aged 18 years and older who ...,Gender,31.8,Female
3,2011,Percent of adults aged 18 years and older who ...,Education,33.6,Less than high school
4,2011,Percent of adults aged 18 years and older who ...,Education,32.8,High school graduate
...,...,...,...,...,...
53387,2016,Percent of adults who engage in no leisure-tim...,Race/Ethnicity,,Asian
53388,2016,Percent of adults who engage in no leisure-tim...,Race/Ethnicity,,Hawaiian/Pacific Islander
53389,2016,Percent of adults who engage in no leisure-tim...,Race/Ethnicity,,American Indian/Alaska Native
53390,2016,Percent of adults who engage in no leisure-tim...,Race/Ethnicity,,2 or more races


In [3]:
# keep only rows where the StratificationCategory1 is Income
data = data[data['StratificationCategory1'] == 'Race/Ethnicity']
# drop rows with missing values
data = data.dropna()
# Drop the StratificationCategory1 column
data = data.drop('StratificationCategory1', axis=1)
# drop rows with Stratification1 = Other or Stratification1 = '2 or more races'
data = data[data['Stratification1'] != 'Other']
data = data[data['Stratification1'] != '2 or more races']

data

Unnamed: 0,YearStart,Question,Data_Value,Stratification1
20,2011,Percent of adults aged 18 years and older who ...,29.8,Non-Hispanic White
21,2011,Percent of adults aged 18 years and older who ...,40.1,Non-Hispanic Black
22,2011,Percent of adults aged 18 years and older who ...,28.6,Hispanic
25,2011,Percent of adults aged 18 years and older who ...,32.9,American Indian/Alaska Native
48,2011,Percent of adults aged 18 years and older who ...,36.4,Non-Hispanic White
...,...,...,...,...
53334,2016,Percent of adults who engage in no leisure-tim...,34.6,Hawaiian/Pacific Islander
53359,2016,Percent of adults who engage in no leisure-tim...,41.6,Hispanic
53384,2016,Percent of adults who engage in no leisure-tim...,18.3,Non-Hispanic White
53385,2016,Percent of adults who engage in no leisure-tim...,24.1,Non-Hispanic Black


In [4]:
# give % of null values in each column
data.isnull().sum() / len(data) * 100


YearStart          0.0
Question           0.0
Data_Value         0.0
Stratification1    0.0
dtype: float64

In [5]:
# Define a mapping from question text to question number
question_mapping = {
    'Percent of adults aged 18 years and older who have obesity': 0,
    'Percent of adults aged 18 years and older who have an overweight classification': 1,
    'Percent of adults who report consuming fruit less than one time daily': 2,
    'Percent of adults who report consuming vegetables less than one time daily': 3,
    'Percent of adults who achieve at least 150 minutes a week of moderate-intensity aerobic physical activity or 75 minutes a week of vigorous-intensity aerobic activity (or an equivalent combination)': 4,
    'Percent of adults who achieve at least 150 minutes a week of moderate-intensity aerobic physical activity or 75 minutes a week of vigorous-intensity aerobic physical activity and engage in muscle-...': 5,
    'Percent of adults who achieve at least 300 minutes a week of moderate-intensity aerobic physical activity or 150 minutes a week of vigorous-intensity aerobic activity (or an equivalent combination)': 6,
    'Percent of adults who engage in muscle-strengthening activities on 2 or more days a week': 7,
    'Percent of adults who engage in no leisure-time physical activity': 8,
}

# Create a new column with the question numbers
data['Question_Num'] = data['Question'].replace(question_mapping)

# Create the new columns based on the conditions
data['obesity'] = data['Question_Num'].apply(lambda x: 1 if x in [0, 1] else 0)
data['food'] = data['Question_Num'].apply(lambda x: 1 if x in [2, 3] else 0)
data['exercise'] = data['Question_Num'].apply(lambda x: 0.5 if x in [4, 5] else (1 if x == 6 else 0))
data['activity'] = data['Question_Num'].apply(lambda x: 1 if x == 7 else (-0.5 if x == 8 else 0))


# drop the Question and Question_Num columns
data = data.drop(['Question', 'Question_Num'], axis=1)

# Display the first few rows of the modified dataframe
data


Unnamed: 0,YearStart,Data_Value,Stratification1,obesity,food,exercise,activity
20,2011,29.8,Non-Hispanic White,1,0,0.0,0.0
21,2011,40.1,Non-Hispanic Black,1,0,0.0,0.0
22,2011,28.6,Hispanic,1,0,0.0,0.0
25,2011,32.9,American Indian/Alaska Native,1,0,0.0,0.0
48,2011,36.4,Non-Hispanic White,1,0,0.0,0.0
...,...,...,...,...,...,...,...
53334,2016,34.6,Hawaiian/Pacific Islander,0,0,0.0,-0.5
53359,2016,41.6,Hispanic,0,0,0.0,-0.5
53384,2016,18.3,Non-Hispanic White,0,0,0.0,-0.5
53385,2016,24.1,Non-Hispanic Black,0,0,0.0,-0.5


In [6]:
from sklearn.preprocessing import OneHotEncoder
ohe = OneHotEncoder(sparse=False)

# encode Stratification1 using OneHotEncoder
stratification1_encoded = ohe.fit_transform(data[['Stratification1']])
stratification1_encoded = pd.DataFrame(stratification1_encoded, columns=ohe.get_feature_names(['Stratification1']), index=data.index)

# drop the original column
data = data.drop('Stratification1', axis=1)

# concatenate the original DataFrame with the encoded DataFrame
data = pd.concat([data, stratification1_encoded], axis=1)

# encode YearStart using LabelEncoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
data['YearStart'] = le.fit_transform(data['YearStart'])

data




Unnamed: 0,YearStart,Data_Value,obesity,food,exercise,activity,Stratification1_American Indian/Alaska Native,Stratification1_Asian,Stratification1_Hawaiian/Pacific Islander,Stratification1_Hispanic,Stratification1_Non-Hispanic Black,Stratification1_Non-Hispanic White
20,0,29.8,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
21,0,40.1,1,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
22,0,28.6,1,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
25,0,32.9,1,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
48,0,36.4,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
53334,5,34.6,0,0,0.0,-0.5,0.0,0.0,1.0,0.0,0.0,0.0
53359,5,41.6,0,0,0.0,-0.5,0.0,0.0,0.0,1.0,0.0,0.0
53384,5,18.3,0,0,0.0,-0.5,0.0,0.0,0.0,0.0,0.0,1.0
53385,5,24.1,0,0,0.0,-0.5,0.0,0.0,0.0,0.0,1.0,0.0


In [158]:
list(data.columns)

['YearStart',
 'Data_Value',
 'obesity',
 'food',
 'exercise',
 'activity',
 'Stratification1_American Indian/Alaska Native',
 'Stratification1_Asian',
 'Stratification1_Hawaiian/Pacific Islander',
 'Stratification1_Hispanic',
 'Stratification1_Non-Hispanic Black',
 'Stratification1_Non-Hispanic White']

In [159]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Define the features and the target variable
features = [
    'obesity', 'food', 'exercise', 'activity', 'YearStart',
    'Stratification1_American Indian/Alaska Native',
    'Stratification1_Asian',
    'Stratification1_Hawaiian/Pacific Islander',
    'Stratification1_Hispanic',
    'Stratification1_Non-Hispanic Black',
    'Stratification1_Non-Hispanic White']
target = 'Data_Value'

# Separate the features and the target variable
X = data[features]
y = data[target]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Random Forest regressor
model_rf = RandomForestRegressor(random_state=42)

# Train the Random Forest regressor
model_rf.fit(X_train, y_train)

# Predict the target variable on the testing set
y_pred = model_rf.predict(X_test)

# Calculate regression metrics to evaluate the model
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'MAE: {mae}')
print(f'MSE: {mse}')
print(f'R² Score: {r2}')


MAE: 5.704471916355501
MSE: 53.04303340328741
R² Score: 0.5141358926675013


In [160]:
# training model without YearStart column
# drop YearStart column
data = data.drop(['YearStart'], axis=1)
data

Unnamed: 0,Data_Value,obesity,food,exercise,activity,Stratification1_American Indian/Alaska Native,Stratification1_Asian,Stratification1_Hawaiian/Pacific Islander,Stratification1_Hispanic,Stratification1_Non-Hispanic Black,Stratification1_Non-Hispanic White
20,29.8,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
21,40.1,1,0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
22,28.6,1,0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
25,32.9,1,0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
48,36.4,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...
53334,34.6,0,0,0.0,-0.5,0.0,0.0,1.0,0.0,0.0,0.0
53359,41.6,0,0,0.0,-0.5,0.0,0.0,0.0,1.0,0.0,0.0
53384,18.3,0,0,0.0,-0.5,0.0,0.0,0.0,0.0,0.0,1.0
53385,24.1,0,0,0.0,-0.5,0.0,0.0,0.0,0.0,1.0,0.0


In [161]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

features = [
    'obesity', 'food', 'exercise', 'activity',
    'Stratification1_American Indian/Alaska Native',
    'Stratification1_Asian',
    'Stratification1_Hawaiian/Pacific Islander',
    'Stratification1_Hispanic',
    'Stratification1_Non-Hispanic Black',
    'Stratification1_Non-Hispanic White']
target = 'Data_Value'

# Separate the features and the target variable
X = data[features]
y = data[target]

# Split the data into training and testing sets before standardizing the "Data_Value"
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the "Data_Value" in the training and testing sets
scaler_y = StandardScaler()
y_train_scaled = scaler_y.fit_transform(y_train.values.reshape(-1, 1)).ravel()
y_test_scaled = scaler_y.transform(y_test.values.reshape(-1, 1)).ravel()

# Initialize the Random Forest regressor
model_rf = RandomForestRegressor(random_state=42)

# Train the Random Forest regressor using the standardized "Data_Value"
model_rf.fit(X_train, y_train_scaled)

# Predict the standardized "Data_Value" on the testing set
y_pred_scaled = model_rf.predict(X_test)

# Calculate regression metrics to evaluate the model
mae_scaled = mean_absolute_error(y_test_scaled, y_pred_scaled)
mse_scaled = mean_squared_error(y_test_scaled, y_pred_scaled)
r2_scaled = r2_score(y_test_scaled, y_pred_scaled)

mae_scaled, mse_scaled, r2_scaled


(0.5458981247755295, 0.4845694496727558, 0.532404894867998)