In [43]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
# Example DataFrame. The values come from a seeded generator so that a
# fresh "Restart & Run All" rebuilds exactly the same synthetic rows.
rng = np.random.default_rng(42)
df = pd.DataFrame({
    'feature1': rng.random(100),
    'feature2': rng.random(100),
    'target': rng.random(100)  # Replace with your target variable
})

# Define features and target
X = df[['feature1', 'feature2']]  # Replace with your features
y = df['target']  # Replace with your target variable

# Hold out 20% of the rows as a test set; random_state pins the shuffle
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [44]:
from xgboost import XGBRegressor

# Create an XGBoost regressor with its default settings
model = XGBRegressor()

# Fit it on the training split (left as the cell's final expression)
model.fit(X=X_train, y=y_train)


In [45]:
from sklearn.metrics import mean_squared_error, r2_score

# Score the held-out rows with the fitted model
predictions = model.predict(X_test)

# Compare the predictions with the true targets: R-squared and MSE
r2 = r2_score(y_test, predictions)
mse = mean_squared_error(y_test, predictions)

# Emit both metrics, one per line, rounded to two decimals
print(
    f"Mean Squared Error: {mse:.2f}",
    f"R-squared: {r2:.2f}",
    sep="\n",
)


Mean Squared Error: 0.08
R-squared: -0.09


In [46]:
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameter values to search over
param_grid = dict(
    n_estimators=[100, 200],
    learning_rate=[0.01, 0.1],
    max_depth=[3, 6],
)

# Grid search over a default XGBRegressor with 3-fold cross-validation,
# fitted on the training split
grid_search = GridSearchCV(XGBRegressor(), param_grid, cv=3)
grid_search.fit(X_train, y_train)

# Report the best-scoring combination
print("Best parameters:", grid_search.best_params_)


Best parameters: {'learning_rate': 0.01, 'max_depth': 3, 'n_estimators': 100}


In [47]:
from sklearn.model_selection import cross_val_score

# Score a fresh default regressor on the full X / y with 5-fold cross-validation
cv_scores = cross_val_score(estimator=XGBRegressor(), X=X, y=y, cv=5)

# Show the per-fold scores followed by their mean
print(
    f"Cross-validation scores: {cv_scores}",
    f"Mean CV score: {cv_scores.mean()}",
    sep="\n",
)


Cross-validation scores: [-0.72723594 -0.48103313 -0.30770242 -1.16591082 -0.96424684]
Mean CV score: -0.7292258298841906


In [48]:
import joblib

# Persist the current `model` object to disk with joblib.
# NOTE: joblib files are pickle-based; only load them back from trusted sources.
joblib.dump(value=model, filename='xgboost_model.pkl')


['xgboost_model.pkl']

In [49]:
# Show the dtype of every feature column in the training split
print(X_train.dtypes)


feature1    float64
feature2    float64
dtype: object


In [50]:
# Report every training column whose dtype is not numeric. Using
# is_numeric_dtype (rather than comparing against 'object') also flags
# string, categorical and datetime columns.
for col in X_train.columns:
    if not pd.api.types.is_numeric_dtype(X_train[col]):
        print(f"Column '{col}' has non-numeric values")


In [51]:
# Clean 'numeric_column' in both splits, if that column is present.
# NOTE(review): clean_numeric is only defined in the following cell, which
# also repeats these same statements. On a fresh kernel this cell avoids a
# NameError only while 'numeric_column' is absent from the frames.
if 'numeric_column' in X_train.columns:
    X_train['numeric_column'] = X_train['numeric_column'].apply(clean_numeric)
if 'numeric_column' in X_test.columns:
    X_test['numeric_column'] = X_test['numeric_column'].apply(clean_numeric)
    

In [52]:
def clean_numeric(value):
    """Extract a leading number from a raw cell value.

    For strings, the first whitespace-separated token is converted to float
    (e.g. ``"12.5 kg"`` -> ``12.5``). Non-string values are passed to
    ``float()`` directly, so values that are already numeric survive.

    Parameters
    ----------
    value : object
        Raw cell value: typically a string, but may be a number, NaN or None.

    Returns
    -------
    float
        The parsed number, or ``np.nan`` when nothing numeric can be
        extracted (empty/blank string, non-numeric token, None, ...).
    """
    if isinstance(value, str):
        tokens = value.split()
        if not tokens:
            # Empty or whitespace-only string: there is no token to parse
            return np.nan
        candidate = tokens[0]
    else:
        # Non-strings (floats, ints, NaN, None) have no .split(); convert as-is
        candidate = value

    try:
        return float(candidate)
    except (TypeError, ValueError):
        return np.nan  # Handle missing or non-numeric values

# Run clean_numeric over 'numeric_column' in each split that actually has it
for split_frame in (X_train, X_test):
    if 'numeric_column' not in split_frame.columns:
        continue
    split_frame['numeric_column'] = split_frame['numeric_column'].apply(clean_numeric)


In [53]:
# Example: fill missing values with 0. The filled frames are rebound to the
# same names instead of using inplace=True, which can trigger
# chained-assignment warnings on frames derived from another DataFrame.
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)


In [54]:
# import pandas as pd
# import numpy as np
# from xgboost import XGBRegressor
# from sklearn.model_selection import train_test_split
# from sklearn.preprocessing import LabelEncoder

# # Step 1: Load the datasets
# train_df = pd.read_csv('train.csv')
# test_df = pd.read_csv('test.csv')

# # Display the first few rows of the datasets for reference
# print(train_df.head())
# print(test_df.head())

# # Identify non-numeric columns
# non_numeric_cols = ['image_link', 'entity_name']  # Example of non-numeric columns

# # Convert non-numeric columns to numeric using Label Encoding
# label_encoders = {}
# for col in non_numeric_cols:
#     le = LabelEncoder()
#     # Fit the encoder only on the training data
#     le.fit(train_df[col].astype(str))
#     # Transform both training and test data
#     train_df[col] = le.transform(train_df[col].astype(str))
#     # Ensure all possible labels are known by the encoder
#     test_df[col] = test_df[col].astype(str)
#     # Use a mapping to handle unseen labels in test data
#     test_df[col] = test_df[col].apply(lambda x: le.transform([x])[0] if x in le.classes_ else -1)
#     label_encoders[col] = le

# # Handle numeric columns with non-numeric values
# def clean_numeric(value):
#     try:
#         return float(value.split()[0])
#     except ValueError:
#         return np.nan

# numeric_columns = ['some_numeric_column']  # Replace with actual numeric columns
# for col in numeric_columns:
#     train_df[col] = train_df[col].apply(clean_numeric)
#     test_df[col] = test_df[col].apply(clean_numeric)

# # Handle missing values if necessary
# train_df.fillna(0, inplace=True)
# test_df.fillna(0, inplace=True)

# # Prepare the data for training
# X = train_df.drop(columns=['entity_value'])
# y = train_df['entity_value']

# # Split the data into training and validation sets
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=44)

# # Initialize and train the model
# model = XGBRegressor()
# model.fit(X_train, y_train)

# # Step 3: Generate predictions
# X_test = test_df.drop(columns=['id'])  # Dropping 'id' if it's not a feature
# predictions = model.predict(X_test)

# # Convert predictions to a DataFrame
# predictions_df = pd.DataFrame(predictions, columns=['entity_value'])

# # Ensure predictions are in the same format as sample_test_out.csv
# # For this example, we'll assume predictions need to be formatted as strings
# predictions_df['entity_value'] = predictions_df['entity_value'].apply(lambda x: f"{x:.2f}")

# # Step 4: Prepare the submission file
# submission_df = test_df[['id']].copy()
# submission_df['entity_value'] = predictions_df['entity_value']
# submission_df.to_csv('test_out.csv', index=False)


In [55]:
# import pandas as pd

# def evaluate_f1_score(ground_truth_file, prediction_file):
#     # Load the ground truth and prediction files
#     gt_df = pd.read_csv(ground_truth_file)
#     pred_df = pd.read_csv(prediction_file)

#     # Ensure both files have the same key to merge on, e.g., 'index' or 'image_link'
#     common_key = 'index'  # Replace with the actual common key
    
#     # Merge ground truth and predictions based on the common key
#     merged_df = pd.merge(gt_df[[common_key, 'entity_value']], pred_df[[common_key, 'prediction']], on=common_key, how='inner')

#     # Initialize counts for TP, FP, FN, and TN
#     TP = FP = FN = TN = 0

#     # Iterate through each row to classify predictions
#     for _, row in merged_df.iterrows():
#         GT = str(row['entity_value']).strip()  # Ground Truth
#         OUT = str(row['prediction']).strip()   # Model Prediction

#         if OUT != "" and GT != "" and OUT == GT:
#             TP += 1  # True Positive
#         elif OUT != "" and GT != "" and OUT != GT:
#             FP += 1  # False Positive
#         elif OUT != "" and GT == "":
#             FP += 1  # False Positive (prediction when none expected)
#         elif OUT == "" and GT != "":
#             FN += 1  # False Negative
#         elif OUT == "" and GT == "":
#             TN += 1  # True Negative

#     # Calculate Precision, Recall, and F1 Score
#     precision = TP / (TP + FP) if (TP + FP) > 0 else 0
#     recall = TP / (TP + FN) if (TP + FN) > 0 else 0
#     f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

#     # Print the results
#     print(f"True Positives (TP): {TP}")
#     print(f"False Positives (FP): {FP}")
#     print(f"False Negatives (FN): {FN}")
#     print(f"True Negatives (TN): {TN}")
#     print(f"Precision: {precision:.4f}")
#     print(f"Recall: {recall:.4f}")
#     print(f"F1 Score: {f1_score:.4f}")

#     return f1_score

# if __name__ == "__main__":
#     ground_truth_file = 'test_with_entity_value.csv'  # Replace with your ground truth CSV file path
#     prediction_file = 'test_predictions.csv'  # Replace with your prediction CSV file path
    
#     # Evaluate the F1 score
#     evaluate_f1_score(ground_truth_file, prediction_file)


In [56]:
import pandas as pd

# Example: binarise the continuous target around a cut-off value
threshold = y.median()  # any other cut-off could be substituted here
y_discrete = y.gt(threshold).astype(int)  # 1 where y exceeds the cut-off, else 0


In [57]:
from sklearn.preprocessing import LabelEncoder

# Example: map each distinct label in y to an integer code.
# NOTE(review): y is the continuous 'target' column built from random floats,
# so practically every value becomes its own class; confirm this is intended.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit(y).transform(y)


In [58]:
import pandas as pd

# Check the unique values of y (wrapped in a Series so that .unique() is available)
print(pd.Series(y).unique())


[0.12882413 0.52344822 0.42215061 0.14721792 0.40611765 0.38759865
 0.12066371 0.15677678 0.43947654 0.12697942 0.38692401 0.90803841
 0.28431598 0.73101476 0.85954439 0.18634271 0.18785704 0.0529842
 0.96288263 0.61153526 0.67672449 0.29502269 0.13359286 0.31157161
 0.72290211 0.73105762 0.71815427 0.91806899 0.87112164 0.32324603
 0.43518786 0.61221582 0.64676943 0.67850408 0.43406493 0.40558937
 0.29730349 0.3666935  0.23297146 0.18516651 0.35183481 0.08419411
 0.71141078 0.65810203 0.57278899 0.06037512 0.82891957 0.67280528
 0.39047841 0.65109517 0.0063868  0.24970574 0.19143303 0.04331295
 0.03576852 0.38466137 0.31354373 0.0323947  0.15849831 0.59948263
 0.61521989 0.67874651 0.81342765 0.48107286 0.48860727 0.48891186
 0.75776506 0.09621606 0.44145686 0.92182007 0.93618834 0.74223622
 0.63576385 0.42004841 0.99090034 0.45072856 0.66177787 0.19229499
 0.40725282 0.72047005 0.36196141 0.09769261 0.5283242  0.02043664
 0.25870087 0.79198315 0.39701484 0.30210969 0.11051224 0.12265

In [59]:
# Example: turn the continuous target into two classes split at the median
threshold = y.median()  # swap in another cut-off if required
y_discrete = y.gt(threshold).astype(int)


In [60]:
import numpy as np
import pandas as pd

# Collect the distinct values present in y_train (taken to be the target)
unique_classes = np.unique(y_train)

# Show the values themselves, then how many there are
print(
    f"Unique classes in y_train: {unique_classes}",
    f"Number of unique classes: {len(unique_classes)}",
    sep="\n",
)


Unique classes in y_train: [0.0063868  0.0323947  0.03576852 0.0529842  0.07984728 0.08419411
 0.09621606 0.09769261 0.11051224 0.12066371 0.12265242 0.12697942
 0.14721792 0.15677678 0.15849831 0.18634271 0.18785704 0.19143303
 0.22357445 0.23297146 0.24970574 0.2583229  0.25870087 0.29502269
 0.29730349 0.30210969 0.31157161 0.31354373 0.32324603 0.35183481
 0.3666935  0.38466137 0.38759865 0.39047841 0.39701484 0.40558937
 0.40725282 0.42215061 0.43406493 0.43947654 0.44145686 0.45072856
 0.48107286 0.48860727 0.48891186 0.52344822 0.5283242  0.56136355
 0.58795163 0.59948263 0.61153526 0.61508643 0.61521989 0.63576385
 0.64676943 0.65109517 0.65810203 0.67280528 0.67672449 0.67874651
 0.70198507 0.71141078 0.71815427 0.72047005 0.72290211 0.73101476
 0.73105762 0.74223622 0.75776506 0.79198315 0.81342765 0.82891957
 0.85954439 0.87112164 0.87508337 0.90803841 0.91806899 0.92024741
 0.92182007 0.99090034]
Number of unique classes: 80


In [61]:
from sklearn.model_selection import train_test_split

# Re-split X / y, this time holding out 30% of the rows
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Count how often each target value occurs in the two splits
print("Training set class distribution:", pd.Series(y_train).value_counts())
print("Test set class distribution:", pd.Series(y_test).value_counts())


Training set class distribution: target
0.908038    1
0.701985    1
0.481073    1
0.599483    1
0.875083    1
           ..
0.122652    1
0.313544    1
0.223574    1
0.035769    1
0.249706    1
Name: count, Length: 70, dtype: int64
Test set class distribution: target
0.020437    1
0.043313    1
0.126979    1
0.258323    1
0.351835    1
0.186343    1
0.921820    1
0.711411    1
0.718154    1
0.110512    1
0.384661    1
0.612216    1
0.284316    1
0.192295    1
0.661778    1
0.406118    1
0.492443    1
0.678504    1
0.420048    1
0.435188    1
0.962883    1
0.128824    1
0.386924    1
0.361961    1
0.133593    1
0.185167    1
0.572789    1
0.060375    1
0.936188    1
0.635764    1
Name: count, dtype: int64


In [62]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Split X and the binarised target y_discrete, holding out 30% for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y_discrete, test_size=0.3, random_state=42
)

# Apply SMOTE to the training split only, to balance its classes
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train)

# Create a logistic-regression classifier and fit it on the balanced data
model = LogisticRegression(random_state=42).fit(X_train_balanced, y_train_balanced)

# Predict labels for the untouched test split
y_pred = model.predict(X_test)

# Show the confusion matrix followed by the per-class report
print(
    confusion_matrix(y_test, y_pred),
    classification_report(y_test, y_pred),
    sep="\n",
)


[[ 8 10]
 [ 6  6]]
              precision    recall  f1-score   support

           0       0.57      0.44      0.50        18
           1       0.38      0.50      0.43        12

    accuracy                           0.47        30
   macro avg       0.47      0.47      0.46        30
weighted avg       0.49      0.47      0.47        30



In [63]:
import pandas as pd
import numpy as np
from sklearn.metrics import precision_score, recall_score, f1_score

# Define your custom evaluation metrics
def compute_custom_metrics(y_true, y_pred):
    """Compute precision, recall and F1 for exact-match predictions.

    An empty string (after stripping surrounding whitespace) means "no
    value". Each position is classified as:

    * true positive  - prediction and ground truth both present and equal
    * false positive - prediction present but different from the ground
      truth, or present although no ground truth is expected
    * false negative - prediction missing while a ground truth is present
    * true negative  - both missing (does not influence any returned score)

    Parameters
    ----------
    y_true, y_pred : array-like
        Ground-truth and predicted values of equal length (list, NumPy array
        or pandas Series). They are compared position by position.

    Returns
    -------
    tuple
        ``(precision, recall, f1)``; a score is 0 when its denominator
        would be zero.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of values.
    """
    def _normalise(values):
        # Work on plain Python objects so the "" checks below are well defined
        # for every input dtype (comparing a numeric array with "" is not).
        flat = np.asarray(values, dtype=object).ravel()
        return [v.strip() if isinstance(v, str) else v for v in flat]

    truths = _normalise(y_true)
    preds = _normalise(y_pred)
    if len(truths) != len(preds):
        raise ValueError(
            f"y_true and y_pred must have the same length "
            f"(got {len(truths)} and {len(preds)})"
        )

    tp = fp = fn = 0
    for truth, pred in zip(truths, preds):
        truth_missing = isinstance(truth, str) and truth == ""
        pred_missing = isinstance(pred, str) and pred == ""

        if pred_missing:
            if not truth_missing:
                fn += 1
            # both missing: true negative, which no returned score uses
        elif not truth_missing and bool(pred == truth):
            tp += 1
        else:
            # wrong value, or a prediction where none was expected
            fp += 1

    precision = tp / (tp + fp) if (tp + fp) > 0 else 0
    recall = tp / (tp + fn) if (tp + fn) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0

    return precision, recall, f1

# Score the test-set predictions with the custom metrics
precision, recall, f1 = compute_custom_metrics(y_test, y_pred)

# Print the three scores, one per line, to four decimals
print(
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1 Score: {f1:.4f}",
    sep="\n",
)

# Assemble the submission table and write it out without the pandas index.
# 'Index' takes the row labels of X_test; make sure that matches the actual
# index column of your dataset.
submission_df = pd.DataFrame({'Index': X_test.index, 'Prediction': y_pred})
submission_df.to_csv('test_out.csv', index=False)


Precision: 0.4667
Recall: 1.0000
F1 Score: 0.6364
