In [19]:
from tpot import TPOTRegressor
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
from pathlib import Path

# Read the CPI / commodity-price table from the CSV file.
cpi_file = r'CPI Data - Sheet1.csv'
cpi = pd.read_csv(cpi_file)

# Parse DATE as datetimes; errors='coerce' turns unparseable entries into NaT
# instead of raising.
cpi['DATE'] = pd.to_datetime(cpi['DATE'], errors='coerce')

# One-hot encode President, then cast the two party indicator columns to
# integers (0/1) and give them their shorter names in a single chain.
party_columns = {
    'President_Democrat': 'President_Dem',
    'President_Republican': 'President_Rep',
}
df_encoded = (
    pd.get_dummies(cpi, columns=['President'])
    .astype({dummy: int for dummy in party_columns})
    .rename(columns=party_columns)
)


In [20]:
# Show the encoded frame as this cell's output, to check the President_Dem / President_Rep columns.
df_encoded

Unnamed: 0,DATE,Crude,natural_gas,Cocoa,Coffee,Soybeans,soybean_oil,soybean_meal,Maize,Wheat,...,Cotton,Rubber,Phosphate,Aluminum,Iron ore,Copper,Gold,CPI,President_Dem,President_Rep
0,1960-01-01,1.63,0.14,0.63,0.94,94.00,204.00,91.90,45.00,59.89,...,0.65,0.82,13.00,511.47,11.42,715.40,35.27,29.370,0,1
1,1960-02-01,1.63,0.14,0.61,0.95,91.00,201.00,86.70,44.00,60.99,...,0.65,0.83,13.00,511.47,11.42,728.19,35.27,29.410,0,1
2,1960-03-01,1.63,0.14,0.58,0.93,92.00,201.00,84.10,45.00,61.73,...,0.65,0.86,13.00,511.47,11.42,684.94,35.27,29.410,0,1
3,1960-04-01,1.63,0.14,0.60,0.93,93.00,207.00,86.70,45.00,60.99,...,0.64,0.86,13.00,511.47,11.42,723.11,35.27,29.540,0,1
4,1960-05-01,1.63,0.14,0.60,0.92,93.00,209.00,81.50,48.00,57.69,...,0.65,0.93,13.00,511.47,11.42,684.75,35.27,29.570,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
762,2023-07-01,78.98,2.55,3.39,4.27,633.85,1135.74,517.23,242.38,345.50,...,2.05,1.49,342.50,2159.73,114.43,8476.68,1951.02,304.348,1,0
763,2023-08-01,84.72,2.58,3.46,4.12,583.93,1126.53,514.56,207.61,315.82,...,2.11,1.47,346.25,2136.57,110.20,8349.13,1918.70,306.269,1,0
764,2023-09-01,92.22,2.64,3.61,4.05,619.04,1111.63,509.31,223.81,314.68,...,2.16,1.55,347.50,2184.67,120.98,8276.71,1915.95,307.481,1,0
765,2023-10-01,89.08,2.99,3.63,4.05,529.57,1133.87,511.73,230.70,298.10,...,2.11,1.61,347.50,2192.21,118.97,7937.18,1916.25,307.619,1,0


In [4]:
# Regression target: the CPI column.
target_variable = df_encoded.loc[:, 'CPI']

# Predictors: every commodity price column plus the two party indicator columns.
feature_columns = [
    'Crude', 'natural_gas', 'Cocoa', 'Coffee',
    'Soybeans', 'soybean_oil', 'soybean_meal',
    'Maize', 'Wheat', 'Banana', 'Orange',
    'Beef', 'Chicken', 'Sugar', 'Cotton', 'Rubber',
    'Phosphate', 'Aluminum', 'Iron ore', 'Copper', 'Gold',
    'President_Dem', 'President_Rep',
]
features = df_encoded.loc[:, feature_columns]

In [5]:
# Plain-text preview of the first rows of the predictors, then of the target.
print(features.head(), target_variable.head(), sep='\n')

   Crude  natural_gas  Cocoa  Coffee  Soybeans  soybean_oil  soybean_meal  \
0   1.63         0.14   0.63    0.94      94.0        204.0          91.9   
1   1.63         0.14   0.61    0.95      91.0        201.0          86.7   
2   1.63         0.14   0.58    0.93      92.0        201.0          84.1   
3   1.63         0.14   0.60    0.93      93.0        207.0          86.7   
4   1.63         0.14   0.60    0.92      93.0        209.0          81.5   

   Maize  Wheat  Banana  ...  Sugar  Cotton  Rubber  Phosphate  Aluminum  \
0   45.0  59.89    0.14  ...   0.12    0.65    0.82       13.0    511.47   
1   44.0  60.99    0.14  ...   0.12    0.65    0.83       13.0    511.47   
2   45.0  61.73    0.14  ...   0.12    0.65    0.86       13.0    511.47   
3   45.0  60.99    0.14  ...   0.12    0.64    0.86       13.0    511.47   
4   48.0  57.69    0.15  ...   0.12    0.65    0.93       13.0    511.47   

   Iron ore  Copper   Gold  President_Dem  President_Rep  
0     11.42  715.40  

In [6]:
# Drop any length-1 axes from the target before it is handed to the estimator.
target_variable = np.squeeze(target_variable)

# Column names grouped by dtype; neither index is referenced again in this cell.
categorical_features = features.select_dtypes(include=['object']).columns
numeric_features = features.select_dtypes(include=['float64', 'int64']).columns

# 75 % / 25 % train/test partition with a fixed seed.
# NOTE(review): the rows are monthly observations and train_test_split shuffles
# by default -- confirm a random split (rather than a chronological one) is intended.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target_variable,
    train_size=0.75,
    test_size=0.25,
    random_state=42,
)

# Evolutionary pipeline search: 10 generations of 50 candidate pipelines,
# ranked by negated mean squared error.
tpot = TPOTRegressor(
    generations=10,
    population_size=50,
    scoring='neg_mean_squared_error',
    random_state=42,
    verbosity=2,
)
tpot.fit(X_train, y_train)

# Held-out score under the same scoring, i.e. the negated MSE (closer to 0 is better).
holdout_score = tpot.score(X_test, y_test)
print(holdout_score)

# Write the best pipeline found out as a standalone Python script.
tpot.export('cpi_output.py')

Optimization Progress:   0%|          | 0/550 [00:00<?, ?pipeline/s]


Generation 1 - Current best internal CV score: -8.531941767012917

Generation 2 - Current best internal CV score: -6.942828066665527

Generation 3 - Current best internal CV score: -6.942828066665527

Generation 4 - Current best internal CV score: -6.942828066665527

Generation 5 - Current best internal CV score: -6.2084700415449765

Generation 6 - Current best internal CV score: -6.2084700415449765

Generation 7 - Current best internal CV score: -3.9408958216301917

Generation 8 - Current best internal CV score: -3.9408958216301917

Generation 9 - Current best internal CV score: -3.9408958216301917

Generation 10 - Current best internal CV score: -3.7106600813535024

Best pipeline: ExtraTreesRegressor(RidgeCV(MaxAbsScaler(input_matrix)), bootstrap=False, max_features=0.7500000000000001, min_samples_leaf=2, min_samples_split=3, n_estimators=100)
-3.5600757037496717
