In [7]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Define the number of entries to simulate
num_entries = 100

# Generate random dates within a range for the survey
start_date = datetime.strptime('1/1/2018', '%m/%d/%Y')
end_date = datetime.strptime('8/1/2024', '%m/%d/%Y')

# Regions for simulation
regions = ['East Coast', 'West Coast', 'Upper East Coast', 'Upper West Coast']

# Function to generate a random date within a range
def random_date(start, end):
    return start + timedelta(days=random.randint(0, (end - start).days))

# Initialize lists to store data
survey_id = list(range(1, num_entries + 1))
region = np.random.choice(regions, num_entries)
survey_date = [random_date(start_date, end_date).strftime('%m/%d/%Y') for _ in range(num_entries)]
latitude = np.random.uniform(-47.0, -34.0, num_entries)
longitude = np.random.uniform(166.0, 179.0, num_entries)

# Generate rubbish counts based on regions
rubbish_count = []
for r in region:
    if r == 'Upper East Coast':
        rubbish_count.append(np.random.normal(loc=200, scale=20))
    elif r == 'East Coast':
        rubbish_count.append(np.random.normal(loc=125, scale=25))
    elif r == 'Upper West Coast':
        rubbish_count.append(np.random.normal(loc=125, scale=25))
    else:  # West Coast
        rubbish_count.append(np.random.normal(loc=50, scale=10))

# Initialize count_same and date_average
count_same = [112] * num_entries
date_average = np.random.uniform(0, 5, num_entries)

# Simulate ocean current data
np.random.seed(42)  # For reproducibility
current_speed = np.random.normal(loc=0.5, scale=0.1, size=num_entries)  # Current speed in m/s
current_direction = np.random.uniform(low=0, high=360, size=num_entries)  # Direction in degrees

# Reduce rubbish count if current direction points away
for i in range(num_entries):
    if 90 < current_direction[i] < 270:  # Assuming away is between East (90) and West (270)
        rubbish_count[i] *= 0.8

# Simulate water temperature (with different means for regions)
temperature = []
for r in region:
    if r == 'Upper East Coast':
        temperature.append(np.random.normal(loc=18, scale=1))
    elif r == 'East Coast':
        temperature.append(np.random.normal(loc=16, scale=1))
    elif r == 'Upper West Coast':
        temperature.append(np.random.normal(loc=17, scale=1))
    else:  # West Coast
        temperature.append(np.random.normal(loc=14, scale=1))

# Create DataFrame
simulated_data = {
    'SurveyId': survey_id,
    'Region': region,
    'SurveyDate': survey_date,
    'SurveyAreaLatitudeStart': latitude,
    'SurveyAreaLongitudeStart': longitude,
    'Rubbish_Count': rubbish_count,
    'count_same': count_same,
    'date average': date_average,
    'Current_Speed': current_speed,
    'Current_Direction': current_direction,
    'Water_Temperature': temperature
}

df_simulated = pd.DataFrame(simulated_data)

# Save to CSV
df_simulated.to_csv('simulated_nz_ocean_current_with_regions.csv', index=False)

print(df_simulated)


    SurveyId            Region  SurveyDate  SurveyAreaLatitudeStart  \
0          1  Upper West Coast  07/20/2024               -42.839355   
1          2  Upper West Coast  10/28/2022               -35.358198   
2          3  Upper East Coast  11/26/2018               -41.940378   
3          4        West Coast  10/13/2018               -46.859111   
4          5        East Coast  03/10/2020               -35.230034   
..       ...               ...         ...                      ...   
95        96        East Coast  12/23/2020               -34.179260   
96        97        West Coast  08/23/2020               -37.923898   
97        98  Upper West Coast  03/11/2021               -40.030747   
98        99  Upper West Coast  01/05/2023               -42.976141   
99       100        East Coast  03/16/2020               -36.420665   

    SurveyAreaLongitudeStart  Rubbish_Count  count_same  date average  \
0                 174.901505      97.859393         112      1.008137   
1

In [8]:
# Convert to DataFrame
df_simulated = pd.DataFrame(simulated_data)

# Encode the 'Region' column
le = LabelEncoder()
df_simulated['Region'] = le.fit_transform(df_simulated['Region'])

# Split into features and target
X = df_simulated.drop(columns=['Rubbish_Count','SurveyDate','count_same','date average','SurveyAreaLatitudeStart','SurveyAreaLongitudeStart','SurveyId'])
y = df_simulated['Rubbish_Count']

print(X)

# Split into training and test sets
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(xtrain)
X_test_scaled = scaler.transform(xtest)

# Linear Regression
linear_model = LinearRegression()
linear_model.fit(X_train_scaled, ytrain)
y_pred_linear = linear_model.predict(X_test_scaled)

# Evaluate Linear Regression
mse_linear = mean_squared_error(ytest, y_pred_linear)
r2_linear = r2_score(ytest, y_pred_linear)

print(f"Linear Regression - Mean Squared Error: {mse_linear:.2f}, R^2 Score: {r2_linear:.2f}")

# Random Forest Model
random_forest_model = RandomForestRegressor(random_state=42, n_estimators=100)
random_forest_model.fit(X_train_scaled, ytrain)
y_pred_rf = random_forest_model.predict(X_test_scaled)

# Evaluate Random Forest
mse_rf = mean_squared_error(ytest, y_pred_rf)
r2_rf = r2_score(ytest, y_pred_rf)

print(f"Random Forest - Mean Squared Error: {mse_rf:.2f}, R^2 Score: {r2_rf:.2f}")


    Region  Current_Speed  Current_Direction  Water_Temperature
0        2       0.549671         150.267961          17.013002
1        2       0.486174          79.958812          18.453534
2        1       0.564769          43.151532          17.735343
3        3       0.652303         121.541462          16.720169
4        0       0.476585         339.447493          16.625667
..     ...            ...                ...                ...
95       0       0.353649          87.836272          14.564138
96       3       0.529612         350.283800          15.163164
97       2       0.526106         141.515181          17.010233
98       2       0.500511         321.136760          16.018491
99       0       0.476541         227.209905          16.462103

[100 rows x 4 columns]
Linear Regression - Mean Squared Error: 2396.40, R^2 Score: 0.40
Random Forest - Mean Squared Error: 894.89, R^2 Score: 0.78
