In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
# Example DataFrame
# Placeholder data: all three columns are independent uniform noise, so the target
# is unrelated to the features and poor scores further down are expected.
# A seeded generator makes a fresh-kernel re-run reproduce the same numbers.
rng = np.random.default_rng(42)
df = pd.DataFrame({
    'feature1': rng.random(100),
    'feature2': rng.random(100),
    'target': rng.random(100)  # Replace with your target variable
})

# Define features and target
X = df[['feature1', 'feature2']]  # Replace with your features
y = df['target']  # Replace with your target variable

# Split data into training and testing sets (80% train / 20% test, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [2]:
from xgboost import XGBRegressor

# Initialize the model
# No hyperparameters are passed, so the library defaults are used.
model = XGBRegressor()

# Train the model
# Only the training split is used here; the test split is kept for evaluation.
model.fit(X_train, y_train)


In [3]:
from sklearn.metrics import mean_squared_error, r2_score

# Score the held-out test split with the fitted model
predictions = model.predict(X_test)

# Compute both regression metrics against the true test targets
mse, r2 = (
    metric(y_test, predictions) for metric in (mean_squared_error, r2_score)
)

# Report each metric on its own line, rounded to two decimals
print(
    f"Mean Squared Error: {mse:.2f}",
    f"R-squared: {r2:.2f}",
    sep="\n",
)


Mean Squared Error: 0.14
R-squared: -0.54


In [4]:
from sklearn.model_selection import GridSearchCV

# Candidate hyperparameters: 2 x 2 x 2 = 8 combinations
param_grid = dict(
    n_estimators=[100, 200],
    learning_rate=[0.01, 0.1],
    max_depth=[3, 6],
)

# Exhaustive search with 3-fold cross-validation on the training split.
# `fit` returns the fitted search object, so construction and fitting are chained.
grid_search = GridSearchCV(XGBRegressor(), param_grid, cv=3).fit(X_train, y_train)

# Show the winning combination
print("Best parameters:", grid_search.best_params_)


Best parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100}


In [5]:
from sklearn.model_selection import cross_val_score

# Score a fresh default model with 5-fold cross-validation over the full X / y
cv_scores = cross_val_score(estimator=XGBRegressor(), X=X, y=y, cv=5)

# Per-fold scores first, then their average
print(
    f"Cross-validation scores: {cv_scores}",
    f"Mean CV score: {cv_scores.mean()}",
    sep="\n",
)


Cross-validation scores: [-0.49041498 -0.70144624 -1.41055004 -1.33894729 -0.53501691]
Mean CV score: -0.8952750929228808


In [6]:
import joblib

# Save the model
# Writes the current `model` object to 'xgboost_model.pkl', a path relative to the
# working directory. joblib files are pickle-based, so only load this file back
# from a location you trust.
joblib.dump(model, 'xgboost_model.pkl')


['xgboost_model.pkl']

In [7]:
# Show the dtype of every training feature column.
print(X_train.dtypes)


feature1    float64
feature2    float64
dtype: object


In [8]:
# Walk the (column name, dtype) pairs and report every training feature that is
# stored with the generic 'object' dtype.
for col, col_dtype in X_train.dtypes.items():
    if col_dtype == 'object':
        print(f"Column '{col}' has non-numeric values")


In [9]:
# NOTE(review): `clean_numeric` is only defined in the following cell. On a fresh
# kernel this cell avoids a NameError solely because neither frame has a
# 'numeric_column' (the features are 'feature1' and 'feature2'), so both guarded
# assignments are skipped. The same two assignments are repeated right after the
# definition of `clean_numeric`; this earlier copy looks redundant — confirm and remove.
if 'numeric_column' in X_train.columns:
    X_train['numeric_column'] = X_train['numeric_column'].apply(clean_numeric)
if 'numeric_column' in X_test.columns:
    X_test['numeric_column'] = X_test['numeric_column'].apply(clean_numeric)
    

In [10]:
def clean_numeric(value):
    """Return the leading numeric token of ``value`` as a float.

    Strings such as ``"500.0 gram"`` are split on whitespace and only the first
    token is parsed. Values that are already numeric are converted directly.

    Args:
        value: A string of the form ``"<number> <unit>"``, a number, or a
            missing value (``None`` / NaN).

    Returns:
        float: The parsed number, or ``np.nan`` when nothing numeric can be
        extracted (blank string, non-numeric first token, ``None``, or any
        other unsupported input).
    """
    if isinstance(value, str):
        tokens = value.split()
        # A blank string yields no tokens; None makes float() raise TypeError below
        candidate = tokens[0] if tokens else None
    else:
        # Non-strings (e.g. the float NaN pandas uses for missing cells) have no
        # .split(); let float() decide whether they are convertible
        candidate = value
    try:
        return float(candidate)
    except (TypeError, ValueError):
        return np.nan  # Handle missing or non-numeric values

# Run the cleaner over 'numeric_column' in whichever split actually contains it
# (training split first, then the test split)
for split in (X_train, X_test):
    if 'numeric_column' in split.columns:
        split['numeric_column'] = split['numeric_column'].apply(clean_numeric)


In [11]:
# Example: fill missing values with 0.
# The filled frames are rebound to the same names instead of using `inplace=True`:
# both frames were produced by `train_test_split`, and in-place writes on such
# derived frames can trigger chained-assignment warnings in some pandas versions.
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)


In [12]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Step 1: Load the datasets
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Display the first few rows of the datasets for reference
print(train_df.head())
print(test_df.head())

# Identify non-numeric columns
non_numeric_cols = ['image_link', 'entity_name']  # Example of non-numeric columns

# Convert non-numeric columns to numeric using Label Encoding
label_encoders = {}
for col in non_numeric_cols:
    le = LabelEncoder()
    # Fit on the training data only, so the test set cannot influence the codes
    train_df[col] = le.fit_transform(train_df[col].astype(str))
    # Build the label -> code lookup once. A label's code is its position in
    # `le.classes_`, so this matches `le.transform` without calling it (and
    # scanning `classes_`) once per test row.
    code_by_label = {label: code for code, label in enumerate(le.classes_)}
    # Labels that never appeared in the training data get the sentinel code -1
    test_df[col] = (
        test_df[col].astype(str).map(code_by_label).fillna(-1).astype('int64')
    )
    label_encoders[col] = le

# Handle numeric columns with non-numeric values
def clean_numeric(value):
    """Return the leading numeric token of ``value`` as a float.

    Strings such as ``"500.0 gram"`` are split on whitespace and only the first
    token is parsed. Values that are already numeric are converted directly.

    Args:
        value: A string of the form ``"<number> <unit>"``, a number, or a
            missing value (``None`` / NaN).

    Returns:
        float: The parsed number, or ``np.nan`` when nothing numeric can be
        extracted (blank string, non-numeric first token, ``None``, or any
        other unsupported input).
    """
    if isinstance(value, str):
        tokens = value.split()
        # A blank string yields no tokens; None makes float() raise TypeError below
        candidate = tokens[0] if tokens else None
    else:
        # Non-strings (e.g. the float NaN pandas uses for missing cells) have no
        # .split(); let float() decide whether they are convertible
        candidate = value
    try:
        return float(candidate)
    except (TypeError, ValueError):
        return np.nan

# The regression target is stored as "<number> <unit>" text (e.g. "500.0 gram"),
# so it has to be reduced to its numeric part before a regressor can be fitted.
# NOTE(review): the unit is discarded, so "1400 milligram" and "500.0 gram" end up
# on different scales — confirm whether values should be normalised per unit.
numeric_columns = ['entity_value']
for col in numeric_columns:
    # na_action='ignore' keeps missing cells as NaN without passing them to the parser
    train_df[col] = train_df[col].map(clean_numeric, na_action='ignore')
    # The test file may not carry this column at all; clean it only when present
    if col in test_df.columns:
        test_df[col] = test_df[col].map(clean_numeric, na_action='ignore')

# Rows whose target could not be parsed carry no training signal; drop them so the
# fill below does not turn them into a made-up target of 0
train_df = train_df.dropna(subset=['entity_value'])

# Handle missing values if necessary (rebinding instead of inplace=True keeps the
# cell safe to re-run)
train_df = train_df.fillna(0)
test_df = test_df.fillna(0)

# Prepare the data for training
X = train_df.drop(columns=['entity_value'])
y = train_df['entity_value']

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=44)

# Initialize and train the model
model = XGBRegressor()
model.fit(X_train, y_train)

# Use the held-out validation split for a quick sanity check of the fit
print(f"Validation R-squared: {model.score(X_val, y_val):.2f}")

# Step 3: Generate predictions
# Select exactly the training feature columns, in training order, so identifier
# columns and any extra test-only columns never reach the model
X_test = test_df[X.columns]
predictions = model.predict(X_test)

# Convert predictions to a DataFrame that shares the test frame's index, so the
# assignment into the submission below lines up row by row
predictions_df = pd.DataFrame(predictions, columns=['entity_value'], index=test_df.index)

# Ensure predictions are in the same format as sample_test_out.csv
# For this example, we'll assume predictions need to be formatted as strings
# NOTE(review): column name and value format are assumptions — confirm against
# sample_test_out.csv
predictions_df['entity_value'] = predictions_df['entity_value'].apply(lambda x: f"{x:.2f}")

# Step 4: Prepare the submission file
# The row identifier is 'id' when the test file has it; otherwise fall back to
# 'index', the name shown in the printed test header
id_column = 'id' if 'id' in test_df.columns else 'index'
submission_df = test_df[[id_column]].copy()
submission_df['entity_value'] = predictions_df['entity_value']
submission_df.to_csv('test_out.csv', index=False)


                                          image_link  group_id  entity_name  \
0  https://m.media-amazon.com/images/I/61I9XdN6OF...    748919  item_weight   
1  https://m.media-amazon.com/images/I/71gSRbyXmo...    916768  item_volume   
2  https://m.media-amazon.com/images/I/61BZ4zrjZX...    459516  item_weight   
3  https://m.media-amazon.com/images/I/612mrlqiI4...    459516  item_weight   
4  https://m.media-amazon.com/images/I/617Tl40LOX...    731432  item_weight   

     entity_value  
0      500.0 gram  
1         1.0 cup  
2      0.709 gram  
3      0.709 gram  
4  1400 milligram  
   index                                         image_link  group_id  \
0      0  https://m.media-amazon.com/images/I/110EibNycl...    156839   
1      1  https://m.media-amazon.com/images/I/11TU2clswz...    792578   
2      2  https://m.media-amazon.com/images/I/11TU2clswz...    792578   
3      3  https://m.media-amazon.com/images/I/11TU2clswz...    792578   
4      4  https://m.media-amazon.com/ima