In [5]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

# Load the entire training dataset
train_dataset_path = 'train.csv'
train_data = pd.read_csv(train_dataset_path)

# Load the test dataset without the target variable
test_dataset_path = 'test.csv'
test_data = pd.read_csv(test_dataset_path)

# Assuming 'features' are the input features and 'target' is the Mohs hardness value
X_train = train_data.drop('Hardness', axis=1)
y_train = train_data['Hardness']

# Standardize the features (optional but often beneficial for regression models)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Train a RandomForestRegressor (you can use other regression algorithms)
regressor = RandomForestRegressor(n_estimators=100, random_state=42)
regressor.fit(X_train_scaled, y_train)

# Preprocess the features of the test dataset
X_test = test_data  # Assuming the test dataset has the same features as the training dataset
X_test_scaled = scaler.transform(X_test)

# Make predictions on the test set
y_pred = regressor.predict(X_test_scaled)

# Save predictions to a CSV file
submission_df = pd.DataFrame({'id': test_data['id'], 'Hardness': y_pred})
submission_df.to_csv('submission.csv', index=False)


In [8]:
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Load the entire training dataset
# Replace 'train_dataset.csv' with the actual file path or URL to your training dataset
train_dataset_path = 'train.csv'
train_data = pd.read_csv(train_dataset_path)

# Load the test dataset without the target variable
# Replace 'test_dataset.csv' with the actual file path or URL to your test dataset
test_dataset_path = 'test.csv'
test_data = pd.read_csv(test_dataset_path)

# Assuming 'features' are the input features and 'target' is the Mohs hardness value
X_train = train_data.drop('Hardness', axis=1)
y_train = train_data['Hardness']

# Standardize the features (optional but often beneficial for regression models)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Split the data into training and validation sets for model evaluation
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train_scaled, y_train, test_size=0.2, random_state=42
)

# Train a Gradient Boosting Regressor
regressor = GradientBoostingRegressor(n_estimators=100, random_state=42)
regressor.fit(X_train_split, y_train_split)

# Make predictions on the validation set
y_val_pred = regressor.predict(X_val_split)

# Evaluate the model on the validation set
mse_val = mean_squared_error(y_val_split, y_val_pred)
print(f'Mean Squared Error on Validation Set: {mse_val:.2f}')

# Preprocess the features of the test dataset
X_test = test_data  # Assuming the test dataset has the same features as the training dataset
X_test_scaled = scaler.transform(X_test)

# Make predictions on the test set
y_pred = regressor.predict(X_test_scaled)

# Save predictions to a CSV file
submission_df = pd.DataFrame({'id': test_data['id'], 'Hardness': y_pred})
submission_df.to_csv('submission1.csv', index=False)


Mean Squared Error on Validation Set: 1.50


In [9]:
import pandas as pd
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error

# Load the entire training dataset
# Replace 'train_dataset.csv' with the actual file path or URL to your training dataset
train_dataset_path = 'train.csv'
train_data = pd.read_csv(train_dataset_path)

# Load the test dataset without the target variable
# Replace 'test_dataset.csv' with the actual file path or URL to your test dataset
test_dataset_path = 'test.csv'
test_data = pd.read_csv(test_dataset_path)

# Assuming 'features' are the input features and 'target' is the Mohs hardness value
X_train = train_data.drop('Hardness', axis=1)
y_train = train_data['Hardness']

# Standardize the features (optional but often beneficial for regression models)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Train a Gradient Boosting Regressor
regressor = GradientBoostingRegressor(random_state=42)

# Define the hyperparameter grid to search
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
}

# Use GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(regressor, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
best_params = grid_search.best_params_
print(f'Best Hyperparameters: {best_params}')

# Make predictions on the test set
X_test = test_data  # Assuming the test dataset has the same features as the training dataset
X_test_scaled = scaler.transform(X_test)
y_pred = grid_search.predict(X_test_scaled)

# Save predictions to a CSV file
submission_df = pd.DataFrame({'id': test_data['id'], 'Hardness': y_pred})
submission_df.to_csv('submission2.csv', index=False)


Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 50}


In [11]:
!pip3 install xgboost

Defaulting to user installation because normal site-packages is not writeable
Collecting xgboost
  Downloading xgboost-2.0.2-py3-none-macosx_12_0_arm64.whl (1.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.9/1.9 MB[0m [31m17.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Installing collected packages: xgboost
Successfully installed xgboost-2.0.2

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1.2[0m[39;49m -> [0m[32;49m23.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/Library/Developer/CommandLineTools/usr/bin/python3 -m pip install --upgrade pip[0m


In [12]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error

# Load the entire training dataset
# Replace 'train_dataset.csv' with the actual file path or URL to your training dataset
train_dataset_path = 'train.csv'
train_data = pd.read_csv(train_dataset_path)

# Load the test dataset without the target variable
# Replace 'test_dataset.csv' with the actual file path or URL to your test dataset
test_dataset_path = 'test.csv'
test_data = pd.read_csv(test_dataset_path)

# Assuming 'features' are the input features and 'target' is the Mohs hardness value
X_train = train_data.drop('Hardness', axis=1)
y_train = train_data['Hardness']

# Standardize the features (optional but often beneficial for regression models)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Train an XGBoost Regressor
regressor = XGBRegressor(random_state=42)

# Define the hyperparameter grid to search
param_grid = {
    'n_estimators': [50, 100, 150],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
}

# Use GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(regressor, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train_scaled, y_train)

# Print the best hyperparameters
best_params = grid_search.best_params_
print(f'Best Hyperparameters: {best_params}')

# Make predictions on the test set
X_test = test_data  # Assuming the test dataset has the same features as the training dataset
X_test_scaled = scaler.transform(X_test)
y_pred = grid_search.predict(X_test_scaled)

# Save predictions to a CSV file
submission_df = pd.DataFrame({'id': test_data['id'], 'Hardness': y_pred})
submission_df.to_csv('submission3.csv', index=False)


Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150}


In [19]:
best_params = {'learning_rate': 0.1, 'max_depth': 4, 'n_estimators': 150}
regressor2 = XGBRegressor(random_state=42, **best_params)
regressor2.fit(X_train_scaled, y_train)
X_test = test_data  # Assuming the test dataset has the same features as the training dataset
X_test_scaled = scaler.transform(X_test)
y_pred_xgb = grid_search.predict(X_test_scaled)

# Save predictions to a CSV file
# submission_df = pd.DataFrame({'id': test_data['id'], 'Hardness': y_pred})
# submission_df.to_csv('submission4.csv', index=False)




In [18]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler

# Load the entire training dataset
train_dataset_path = 'train.csv'
train_data = pd.read_csv(train_dataset_path)

# Load the test dataset without the target variable
test_dataset_path = 'test.csv'
test_data = pd.read_csv(test_dataset_path)

# Assuming 'features' are the input features and 'target' is the Mohs hardness value
X_train = train_data.drop('Hardness', axis=1)
y_train = train_data['Hardness']

# Define the hyperparameter grid to search
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt', 'log2'],
}


# Standardize the features (optional but often beneficial for regression models)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Train a RandomForestRegressor (you can use other regression algorithms)
regressor = RandomForestRegressor(n_estimators=100, random_state=42)
# Use GridSearchCV for hyperparameter tuning
grid_search = GridSearchCV(regressor, param_grid, cv=5, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
best_params = grid_search.best_params_
print(f'Best Hyperparameters: {best_params}')
regressor1=RandomForestRegressor(random_state=42,**best_params)
regressor1.fit(X_train_scaled, y_train)

# Preprocess the features of the test dataset
X_test = test_data  # Assuming the test dataset has the same features as the training dataset
X_test_scaled = scaler.transform(X_test)

# Make predictions on the test set
y_pred = regressor1.predict(X_test_scaled)

# Save predictions to a CSV file
submission_df = pd.DataFrame({'id': test_data['id'], 'Hardness': y_pred})
submission_df.to_csv('submission5.csv', index=False)


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(


Best Hyperparameters: {'max_depth': 30, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 150}


In [20]:
ensemble_preds = 0.5 * y_pred + 0.5 * y_pred_xgb
submission_df = pd.DataFrame({'id': test_data['id'], 'Hardness': y_pred})
submission_df.to_csv('submission6.csv', index=False)


In [25]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error

# Load and preprocess your data
# ...

train_data1=train_data
X=train_data1.drop("Hardness",axis=1)
y=train_data1["Hardness"]
# Split the data
X_train_splitted, X_val_splitted, y_train_splitted, y_val_splitted = train_test_split(X,y,test_size=0.2, random_state=42)

# Define a grid of hyperparameters for different regressors
param_grid = {
    'RandomForestRegressor': {
        'n_estimators': [50, 100, 150],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'max_features': ['auto', 'sqrt', 'log2'],
    },
    'GradientBoostingRegressor': {
        'n_estimators': [50, 100, 150],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 4, 5],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
    },
    'LinearRegression': {},
}

# Create regressor instances
regressors = {
    'RandomForestRegressor': RandomForestRegressor(random_state=42),
    'GradientBoostingRegressor': GradientBoostingRegressor(random_state=42),
    'LinearRegression': LinearRegression(),
}

# Perform GridSearchCV for each regressor
best_models = {}
for regressor_name, regressor in regressors.items():
    grid_search = GridSearchCV(regressor, param_grid[regressor_name], cv=5, scoring='neg_mean_squared_error')
    grid_search.fit(X_train_splitted, y_train_splitted)
    best_models[regressor_name] = grid_search.best_estimator_

# Evaluate each model on the validation set
mse_results = {}
for regressor_name, model in best_models.items():
    preds = model.predict(X_val_splitted)
    mse = mean_squared_error(y_val_splitted, preds)
    mse_results[regressor_name] = mse
    print(f'Mean Squared Error for {regressor_name}: {mse:.2f}')

# Identify the model with the minimum MSE
best_regressor = min(mse_results, key=mse_results.get)
print(f'The best regressor is: {best_regressor}')


  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
  warn(
