In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.svm import SVR
# Build a reproducible synthetic reviewer dataset.
np.random.seed(42)  # fixed seed: every run of this cell yields the same rows
n_samples = 1000

# Draw each column separately, in the same order the columns appear in the
# frame, so the seeded global random stream is consumed in a fixed sequence.
review_frequency = np.random.randint(1, 31, n_samples)  # integers 1..30 (upper bound is exclusive)
is_plus_member = np.random.choice([True, False], n_samples)
is_verified_purchase = np.random.choice([True, False], n_samples)

data = pd.DataFrame({
    'reviewFrequency': review_frequency,
    'isPlusMember': is_plus_member,
    'isVerifiedPurchase': is_verified_purchase,
})

# Define trust score based on parameters
def calculate_trust_score(row):
    """Return a randomized trust score for a single reviewer row.

    The score is assembled from three components, each drawn from NumPy's
    global random generator (so results depend on the current seed/state):

    * review frequency tier: up to 10 -> uniform(1, 2),
      above 10 up to 20 -> uniform(2, 3), above 20 -> uniform(3, 4);
    * Plus membership: adds uniform(1, 2.5) when ``isPlusMember`` is truthy;
    * verified purchase: subtracts uniform(1.5, 2.5) when
      ``isVerifiedPurchase`` is falsy.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row or a dict)
        Must provide the keys ``'reviewFrequency'``, ``'isPlusMember'`` and
        ``'isVerifiedPurchase'``.

    Returns
    -------
    float
        The combined score, capped at 5. No lower bound is applied, so an
        unverified, non-member, low-frequency row can come out negative.
    """
    trust_score = 0

    # Review frequency tier. Only upper bounds are compared so the tiers have
    # no gaps: previously a value between tiers (e.g. 10.5) or below 1 fell
    # through to the final branch and received the *highest* tier.
    frequency = row['reviewFrequency']
    if frequency <= 10:
        trust_score += np.random.uniform(1, 2)
    elif frequency <= 20:
        trust_score += np.random.uniform(2, 3)
    else:
        trust_score += np.random.uniform(3, 4)

    # Plus membership bonus.
    if row['isPlusMember']:
        trust_score += np.random.uniform(1, 2.5)

    # Unverified purchases are penalized.
    if not row['isVerifiedPurchase']:
        trust_score -= np.random.uniform(1.5, 2.5)

    # Cap trust score at 5.
    return min(trust_score, 5)

# Score every row (row-wise apply, in index order) and attach the result
# as a new 'trustScore' column.
trust_scores = data.apply(calculate_trust_score, axis=1)
data['trustScore'] = trust_scores

# Optional export, left disabled:
#data.to_csv('generated_dataset.csv', index=False)

# Show the resulting frame as the cell output
data


Unnamed: 0,reviewFrequency,isPlusMember,isVerifiedPurchase,trustScore
0,7,True,True,2.953275
1,20,True,True,4.588733
2,29,False,False,1.039565
3,15,True,True,4.700953
4,11,False,True,2.483546
...,...,...,...,...
995,21,True,False,2.294493
996,5,True,True,3.147532
997,10,True,False,0.928829
998,10,False,True,1.505509


In [4]:
import numpy as np
import pandas as pd
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Generate the synthetic supplier dataset.
n_samples = 1000

# Seed NumPy's global generator so the draws below are reproducible.
np.random.seed(42)

# Both feature columns are plain seeded integer draws. The frame used to be
# initialised from make_regression output, but every value was overwritten
# immediately afterwards, so that step was dead work and has been dropped.
# Years are drawn before deadlines to keep the random stream (and therefore
# the generated data) the same as before.
years_supplying = np.random.randint(1, 51, size=n_samples)   # integers 1..50
deadlines_missed = np.random.randint(1, 11, size=n_samples)  # integers 1..10

data = pd.DataFrame({
    'deadlines_missed': deadlines_missed,
    'years_supplying': years_supplying,
})

# Define the timely supply score function
def calculate_timely_supply_score(deadlines_missed, years_supplying):
    """Score timely supply on a 1-to-5 scale.

    Each missed deadline is weighted by ``1 / (1 + years_supplying / 10)``,
    so the penalty per missed deadline shrinks as the supply duration grows.
    The weighted total is subtracted from 5 and the result is clipped to the
    range [1, 5].

    Parameters
    ----------
    deadlines_missed : number
        Count of missed deadlines.
    years_supplying : number
        Supply duration in years.

    Returns
    -------
    The clipped score, as returned by ``np.clip``.
    """
    tenure_factor = 1.0 + years_supplying / 10.0
    per_deadline_weight = 1.0 / tenure_factor  # decreasing impact with supply duration
    raw_score = 5 - (deadlines_missed * per_deadline_weight)
    # Keep the result inside the 1..5 band.
    return np.clip(raw_score, 1, 5)

# Calculate timely supply scores for all rows in one vectorized call.
# The scoring function uses only elementwise arithmetic and np.clip, so it
# accepts whole columns directly; this replaces a row-by-row
# data.apply(lambda row: ..., axis=1) and yields the same values.
data['timely_supply_score'] = calculate_timely_supply_score(
    data['deadlines_missed'], data['years_supplying']
)

# Split data into features (X) and target (y).
# Note: the target is computed from exactly these two features by the
# scoring formula above.
X = data[['deadlines_missed', 'years_supplying']]
y = data['timely_supply_score']

# Hold out 30% of the rows for validation (fixed random_state for a repeatable split)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.3, random_state=42)

# Initialize and train the Random Forest Regression model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Make predictions on the validation set
y_pred = model.predict(X_valid)

# Calculate Mean Squared Error (MSE); it is stored in `mse` but not displayed
mse = mean_squared_error(y_valid, y_pred)

# Optional diagnostics, left disabled:
#print("Mean Squared Error:", mse)
#average_timely_supply_score = data['timely_supply_score'].mean()
#print("Average Timely Supply Score:", average_timely_supply_score)

# Show the scored frame as the cell output
data




Unnamed: 0,deadlines_missed,years_supplying,timely_supply_score
0,9,39,3.163265
1,1,29,4.743590
2,5,15,3.000000
3,6,43,3.867925
4,5,8,2.222222
...,...,...,...
995,2,26,4.444444
996,1,34,4.772727
997,10,45,3.181818
998,8,6,1.000000
