In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer


## Load the dataset

In [3]:
# Read the training and test CSV files from the working directory
# (train first, then test).
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv')
)


In [6]:
# Set the test-set 'ID' column aside before preprocessing; 'ID' is dropped
# from the test features further down, so this is the only remaining copy.
# NOTE(review): presumably used to label predictions later — the use is not
# visible in this part of the notebook.
test_ids = test_data['ID']

In [8]:
def _impute_missing(frame, numeric_fill):
    """Return a copy of ``frame`` with missing values filled per column type.

    Non-numeric columns receive the literal category ``'missing'``.
    Numeric columns are filled from ``numeric_fill`` (a mapping of
    column name -> value); numeric columns without an entry are left as is.
    """
    text_fill = dict.fromkeys(
        frame.select_dtypes(exclude=['number']).columns, 'missing'
    )
    # Per-frame text columns take precedence over the numeric fill values.
    return frame.fillna({**numeric_fill, **text_fill})


# Writing the string 'missing' into *every* column turns numeric columns that
# contain NaNs into mixed str/float object columns, which the label-encoding
# step later either rejects or collapses into arbitrary category codes.
# Numeric features are therefore imputed with the training-set median; the
# target column is deliberately excluded so labels are never fabricated.
_numeric_fill = (
    train_data.drop(columns=['matched_score'])
    .median(numeric_only=True)
    .dropna()  # all-NaN columns have no usable median
    .to_dict()
)
train_data = _impute_missing(train_data, _numeric_fill)
test_data = _impute_missing(test_data, _numeric_fill)

In [10]:
# Separate the regression target from the feature columns.
y = train_data.loc[:, 'matched_score']
X = train_data.drop('matched_score', axis=1)

In [14]:
# Stack train features on top of the test features so that the text and
# categorical encoders below see one shared vocabulary / label set.
# NOTE(review): 'ID' is removed from the test frame only — confirm that the
# training frame has no 'ID' column, otherwise it stays in as a feature.
combined_data = pd.concat((X, test_data.drop('ID', axis=1)), axis=0)

In [16]:
# Vectorise the free-text 'skills' column: keep the 100 highest-ranked terms
# after removing English stop words, and materialise the result as a dense
# array.
tfidf = TfidfVectorizer(stop_words='english', max_features=100)
skills_tfidf = tfidf.fit_transform(
    combined_data['skills'].fillna('missing')
).toarray()


In [18]:
# Wrap the TF-IDF matrix in a frame whose columns are named skill_0,
# skill_1, ... and append it column-wise. The combined frame's index is
# reset first so both sides share the same 0..n-1 row labels.
tfidf_df = pd.DataFrame(skills_tfidf).add_prefix('skill_')
combined_data = pd.concat(
    [combined_data.reset_index(drop=True), tfidf_df],
    axis=1,
)


In [20]:
# The raw text column is now represented by the skill_* TF-IDF columns.
combined_data = combined_data.drop('skills', axis=1)

In [22]:
# One LabelEncoder per object-typed column, kept in a dict keyed by column
# name so the fitted encoders remain available after the columns are
# replaced by their integer codes.
label_encoders = {
    col: LabelEncoder()
    for col in combined_data.select_dtypes(include=['object']).columns
}
for col, le in label_encoders.items():
    combined_data[col] = le.fit_transform(combined_data[col])


In [24]:
# Undo the earlier stacking: the first len(X) rows are the training
# features, the remainder are the test features. Both slices are taken in a
# single assignment, so len(X) is read before X is rebound.
X, test_data = combined_data.iloc[:len(X)], combined_data.iloc[len(X):]

In [26]:
# Hold out 20% of the training rows for validation; the seed is fixed so
# the split is reproducible.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [30]:
# Hyper-parameter values explored by the grid search
# (every combination of the three lists is tried).
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5, 10],
)

In [None]:
# Exhaustive 3-fold cross-validated search over param_grid, scored by
# negative mean squared error (higher is better).
rf = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(
    rf,
    param_grid,
    cv=3,
    scoring='neg_mean_squared_error',
    # Every candidate/fold fit is independent, so spread them over all
    # available cores. The forest's random_state is fixed, so the selected
    # model is the same as with a serial search.
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Best parameter combination, refit on the whole of X_train.
best_model = grid_search.best_estimator_
