Hello!
This is my first notebook on Kaggle, and I would be glad to receive any feedback.

In [None]:
# import libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Read the train and test splits from the Kaggle input directory.

train, test = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/social-media-usage-and-emotional-well-being/train.csv',
        '/kaggle/input/social-media-usage-and-emotional-well-being/test.csv',
    )
)

In [None]:
# Dimensions of the test frame as (rows, columns).
test.shape

In [None]:
# Dimensions of the training frame as (rows, columns).
train.shape

In [None]:
# Preview the first rows of the training frame to see its columns and sample values.

train.head()

In [None]:
# Preview the first rows of the test frame.
test.head()

In [None]:
# List the column names of the training frame.
train.columns

In [None]:
# The 'Gender' and 'Age' columns contain mismatched values (per the original note).
# List the distinct raw 'Gender' values before they are cleaned.

train['Gender'].unique()

In [None]:
def clean_column_gender(col):
    """Return a cleaned 'Gender' value for a single cell.

    A value that ``int()`` can convert (for example ``27`` or ``'27'``) is not
    a gender label, so it is replaced with the placeholder ``'Uncertain'``.
    Every other value, including ``None`` and NaN, is returned unchanged.

    Parameters
    ----------
    col : object
        One cell of the 'Gender' column.

    Returns
    -------
    object
        ``'Uncertain'`` for integer-like input, otherwise ``col`` itself.
    """
    try:
        int(col)
    except (ValueError, TypeError, OverflowError):
        # Only conversion failures mean "not a number" (ordinary text, NaN,
        # None, infinity). Anything else, e.g. KeyboardInterrupt, propagates.
        return col
    return 'Uncertain'

# Clean 'Gender' in both frames, then label any value that is still missing as 'Uncertain'.
train['Gender'] = train['Gender'].apply(clean_column_gender).fillna('Uncertain')
test['Gender'] = test['Gender'].apply(clean_column_gender).fillna('Uncertain')

In [None]:
# Count the missing values in each column of the training frame.
train.isnull().sum()

In [None]:
# Replace every remaining missing value, in every column of both frames, with 0.
# NOTE(review): this also writes a numeric 0 into any text column that has gaps
# (including the target column) - confirm that this is intended.
train, test = train.fillna(0), test.fillna(0)

In [None]:
# Re-check the distinct 'Gender' values after cleaning.
train['Gender'].unique()

In [None]:
def clean_column_age(col):
    """Return a single 'Age' cell as an integer, or 0 when it is not a number.

    Parameters
    ----------
    col : object
        One cell of the 'Age' column.

    Returns
    -------
    int
        ``int(col)`` when the conversion succeeds (floats are truncated toward
        zero), otherwise the sentinel ``0``.
    """
    try:
        # Convert once and return the result directly.
        return int(col)
    except (ValueError, TypeError, OverflowError):
        # Only conversion failures (text such as 'Male', NaN, None, infinity)
        # map to the sentinel. Anything else, e.g. KeyboardInterrupt, propagates.
        return 0

# Convert 'Age' to integers in both frames (non-numeric entries become 0),
# then fill anything still missing with 0.
train['Age'] = train['Age'].apply(clean_column_age).fillna(0)
test['Age'] = test['Age'].apply(clean_column_age).fillna(0)

In [None]:
# Check the distinct 'Age' values after cleaning.
train['Age'].unique()

In [None]:
# Number of rows per 'Dominant_Emotion', with the bars split by 'Gender'.

sns.countplot(
    data=train,
    x='Dominant_Emotion',
    hue='Gender',
    palette="husl",
)

In [None]:
# 'Likes_Received_Per_Day' against 'Daily_Usage_Time (minutes)', coloured by 'Dominant_Emotion'.
# `data` is passed by keyword, as the countplot cell does, so the call does not
# depend on `data` being the first positional parameter of scatterplot.
# Marker size is left at the seaborn default: `sizes=` only takes effect together
# with a `size=` variable, so it is not passed (use `s=` for a fixed marker size).

sns.scatterplot(
    data=train,
    x='Likes_Received_Per_Day',
    y='Daily_Usage_Time (minutes)',
    hue='Dominant_Emotion',
    palette="husl",
)

In [None]:
# Stacked histogram of 'Age' split by 'Gender', with the x-axis limited to 20-35.
# `boxplot` holds the Axes returned by histplot (a histogram, despite the name).

plt.figure(figsize=(10, 6))
boxplot = sns.histplot(
    data=train,
    x='Age',
    hue='Gender',
    multiple="stack",
    palette='husl',
)
boxplot.set_xlim(left=20, right=35)

In [None]:
# preparation for models

# Encode 'Gender' as integer codes with one mapping shared by both frames.
# Series.map turns every value that is not a key of the dict into NaN, so both
# frames fall back to 0 (the 'Uncertain' code) for unmapped or missing values.
# This keeps NaN out of the feature matrix, which some estimators reject.
gender_codes = {'Uncertain': 0, 'Female': 1, 'Male': 2, 'Non-binary': 3}
train['Gender'] = train['Gender'].map(gender_codes).fillna(0).astype(int)
test['Gender'] = test['Gender'].map(gender_codes).fillna(0).astype(int)

In [None]:
# Distinct 'Dominant_Emotion' values present in the training frame.
train['Dominant_Emotion'].unique()

In [None]:
# Distinct 'Dominant_Emotion' values present in the test frame.
test['Dominant_Emotion'].unique()

In [None]:
# Encode the target labels as integer codes, using one mapping for both frames.
# Only real emotion names are mapped. Anything else - including the 0 that the
# earlier blanket fillna(0) writes in place of a missing label - becomes NaN,
# and those rows are dropped. They are neither counted as 'Happiness' (code 0)
# in training nor left as NaN in the evaluation target.
emotion_codes = {'Happiness': 0, 'Anger': 1, 'Neutral': 2, 'Anxiety': 3, 'Boredom': 4, 'Sadness': 5}

train = (
    train.assign(Dominant_Emotion=train['Dominant_Emotion'].map(emotion_codes))
    .dropna(subset=['Dominant_Emotion'])
    .astype({'Dominant_Emotion': int})
)
test = (
    test.assign(Dominant_Emotion=test['Dominant_Emotion'].map(emotion_codes))
    .dropna(subset=['Dominant_Emotion'])
    .astype({'Dominant_Emotion': int})
)

In [None]:
# Separate the target from the features in both frames.
# 'User_ID' and 'Platform' are left out of the features along with the target.

y_train = train['Dominant_Emotion']
x_train = train.drop(columns=['Dominant_Emotion', 'User_ID', 'Platform'])

y_test = test['Dominant_Emotion']
x_test = test.drop(columns=['Dominant_Emotion', 'User_ID', 'Platform'])

In [None]:
# Gradient boosting model for predicting 'Dominant_Emotion'.
# NOTE(review): the target is encoded as integer category codes elsewhere in this
# notebook and is fit here with a regressor - confirm that treating the codes as
# ordered numbers is intended (a classifier may be the better fit).

# Hyper-parameter grid for the grid search: 2 values for each of 5 parameters,
# i.e. 32 combinations.
boost_params = {
    'n_estimators': [500, 1000],
    'learning_rate': [0.02, 0.05],
    'max_depth': [1, 2],
    'min_samples_leaf': [5, 10],
    'min_samples_split': [5, 10]
}

# A fixed seed makes repeated runs build the same trees and report the same score.
boost_model = GradientBoostingRegressor(random_state=42)

In [None]:
# Tune the boosting model with a 3-fold grid search on all cores; a failing fit
# raises immediately (error_score='raise'). The refit best model then predicts
# the test features, and the cell displays the test-set mean squared error.
boost_grid = GridSearchCV(
    estimator=boost_model,
    param_grid=boost_params,
    cv=3,
    n_jobs=-1,
    error_score='raise',
)
boost_grid.fit(x_train, y_train)
y_pred = boost_grid.predict(x_test)
mean_squared_error(y_test, y_pred)

In [None]:
# K-nearest-neighbours regressor for 'Dominant_Emotion'.
# Features are standardised inside the pipeline before the neighbour search;
# n_neighbors is tuned over 1..99 with a 5-fold grid search, and the cell
# displays the test-set mean squared error of the refit best pipeline.

knn_pipe = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("knn", KNeighborsRegressor(n_jobs=-1)),
])
knn_params = {"knn__n_neighbors": range(1, 100)}
knn_grid = GridSearchCV(
    estimator=knn_pipe,
    param_grid=knn_params,
    cv=5,
    n_jobs=-1,
    verbose=True,
)
knn_grid.fit(x_train, y_train)
y_pred_k = knn_grid.predict(x_test)
mean_squared_error(y_test, y_pred_k)