<a href="https://colab.research.google.com/github/yashphulfagar/MLNotebooks/blob/main/MLPContest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the data
train_data = pd.read_csv('train.csv')


train_data.head()

Unnamed: 0,ID,RecipeNumber,RecipeCode,RecipeName,CommentID,UserID,UserName,UserReputation,CreationTimestamp,ReplyCount,ThumbsUpCount,ThumbsDownCount,Rating,BestScore,Recipe_Review
0,70,71,12259,Banana Bars with Cream Cheese Frosting,sp_aUSaElGf_12259_c_149978,u_1oKVaBmiEKwAFyiHrDLoWpbel0J,misscoffeepot,20,1622717078,0,0,0,5,100,great recipe! I have made them just as written...
1,72,88,8202,Simple Taco Soup,sp_aUSaElGf_8202_c_310332,u_1oKZeRIXYzGNhGW8OMR3O4lX1ih,MichelleMalone,1,1622717852,0,0,1,5,100,This is an easy and quick recipe that is great...
2,458,3,2832,Cheeseburger Soup,sp_aUSaElGf_2832_c_206522,u_1oKYHUtdaD48KJCDYq5wfpAUxWz,jeannem32,1,1622717357,0,0,0,3,100,I think I was looking for something extra spec...
3,7,50,100276,Grilled Huli Huli Chicken,sp_aUSaElGf_100276_c_434088,u_1oKZCQcKQZdQhDVBS7oISc216VA,ds422,1,1622648899,0,0,0,5,100,This is our favorite grilled chicken recipe. ...
4,60,12,19731,Cauliflower Soup,sp_aUSaElGf_19731_c_387011,u_1oKd4sudZQPgu6YALiC7bW7mKMs,SusanHoward,1,1622718260,0,0,0,4,100,Great basic recipe. I added 2 teaspoons Tuscan...


In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report


In [6]:
# Features: Numerical columns
numerical_features = train_data.drop(['Rating', 'Recipe_Review', 'RecipeName', 'CommentID', 'UserID', 'UserName'], axis=1)

# Target variable
y = train_data['Rating']

# Text features: 'Recipe_Review'
text_features = train_data['Recipe_Review']

In [7]:
# Use TF-IDF for text feature representation
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)

# Handle missing values in 'Recipe_Review'
text_features = text_features.fillna('')

# Transform text features using TF-IDF
text_features_tfidf = tfidf_vectorizer.fit_transform(text_features)

# Combine numerical and text features
X = pd.concat([numerical_features.reset_index(drop=True), pd.DataFrame(text_features_tfidf.toarray())], axis=1)

# Convert column names to strings for all features
X.columns = X.columns.astype(str)


In [8]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize the RandomForest model with class weights
class_weights = {0: 0.1, 1: 0.2, 2: 0.2, 3: 0.1, 4: 0.1, 5: 0.3}  # Adjust weights as needed
rf_model_weighted = RandomForestClassifier(class_weight=class_weights)

# Train the model
rf_model_weighted.fit(X_train, y_train)

# Make predictions
rf_predictions_weighted = rf_model_weighted.predict(X_test)

# Evaluate the model with class weights
accuracy_weighted = accuracy_score(y_test, rf_predictions_weighted)
report_weighted = classification_report(y_test, rf_predictions_weighted)

print(f'Random Forest (Weighted) Accuracy: {accuracy_weighted}')
print('Random Forest (Weighted) Classification Report:\n', report_weighted)


Random Forest (Weighted) Accuracy: 0.7642961876832844
Random Forest (Weighted) Classification Report:
               precision    recall  f1-score   support

           0       0.50      0.07      0.12       257
           1       0.60      0.22      0.32        41
           2       0.50      0.03      0.06        34
           3       0.60      0.11      0.18        85
           4       0.35      0.02      0.04       250
           5       0.77      0.99      0.87      2061

    accuracy                           0.76      2728
   macro avg       0.55      0.24      0.26      2728
weighted avg       0.70      0.76      0.68      2728



In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier

# Define the hyperparameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5]
}

# Initialize the XGBoost model
xgb_model_tuned = XGBClassifier()

# Initialize Grid Search with 5-fold cross-validation
grid_search = GridSearchCV(xgb_model_tuned, param_grid, cv=5, scoring='accuracy')

# Perform Grid Search
grid_search.fit(X_train, y_train)

# Get the best parameters from Grid Search
best_params = grid_search.best_params_

# Train the model with the best parameters
xgb_model_best = XGBClassifier(**best_params)
xgb_model_best.fit(X_train, y_train)

# Make predictions with the best model
xgb_predictions_best = xgb_model_best.predict(X_test)

# Evaluate the best model
accuracy_best = accuracy_score(y_test, xgb_predictions_best)
report_best = classification_report(y_test, xgb_predictions_best)

print(f'XGBoost (Tuned) Accuracy: {accuracy_best}')
print('XGBoost (Tuned) Classification Report:\n', report_best)
print('Best Hyperparameters:', best_params)