In [110]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, PolynomialFeatures
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

#### Loading Dataset

In [113]:
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')


In [115]:
test_ids = test_data['ID']

#### Handling missing values
Missing values are filled with the string 'missing' to ensure compatibility with text processing steps like TF-IDF.

In [118]:

train_data.fillna('missing', inplace=True)
test_data.fillna('missing', inplace=True)

#### Dropping columns

In [121]:

train_data.drop(columns=['address', 'passing_years', 'company_urls', 'extra_curricular_organization_links', 'online_links', 'issue_dates','extra_curricular_activity_types','extra_curricular_organization_names', 'expiry_dates','role_positions','proficiency_levels'], inplace=True)
test_data.drop(columns=['address', 'passing_years', 'company_urls', 'extra_curricular_organization_links', 'online_links', 'issue_dates','extra_curricular_activity_types','extra_curricular_organization_names', 'expiry_dates','role_positions','proficiency_levels'], inplace=True)

#### Separate features and target

In [124]:

X = train_data.drop(columns=['matched_score'])
y = train_data['matched_score']
combined_data = pd.concat([X, test_data.drop(columns=['ID'])], axis=0)

#### TF-IDF Vectorization for text columns

 TF-IDF parameters:
    - max_features=500: Limits the maximum number of features (most frequent terms) to 500 to reduce dimensionality.
    - stop_words='english': Removes common English stop words to improve meaningful feature extraction.
    - ngram_range=(1, 2): Considers unigrams and bigrams to capture more context from the text.

In [127]:

text_columns = [
    'skills', 'career_objective', 'educational_institution_name',
    'related_skils_in_job', 'responsibilities', 'skills_required',
     'degree_names', 'major_field_of_studies',
    'professional_company_names', 'positions', 'locations',
     'languages', 'certification_providers',
    'certification_skills', '﻿job_position_name'
]
for text_col in text_columns:
    tfidf = TfidfVectorizer(max_features=500, stop_words='english', ngram_range=(1, 2))
    tfidf_matrix = tfidf.fit_transform(combined_data[text_col].fillna('missing')).toarray()
    tfidf_df = pd.DataFrame(tfidf_matrix, columns=[f'{text_col}_{i}' for i in range(tfidf_matrix.shape[1])])
    combined_data = pd.concat([combined_data.reset_index(drop=True), tfidf_df], axis=1)
    combined_data.drop(columns=[text_col], inplace=True)


#### Encode categorical features

In [130]:

label_encoders = {}
for col in combined_data.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    combined_data[col] = le.fit_transform(combined_data[col])
    label_encoders[col] = le


#### Scale numerical features

In [133]:

scaler = StandardScaler()
numerical_cols = combined_data.select_dtypes(include=['float64', 'int64']).columns
combined_data[numerical_cols] = scaler.fit_transform(combined_data[numerical_cols])

#### Split back into training and testing sets

In [136]:

X = combined_data.iloc[:len(X), :]
test_data_processed = combined_data.iloc[len(X):, :]

#### Feature selection using LightGBM

In [139]:

lgb_temp = lgb.LGBMRegressor(random_state=42, n_estimators=100)
lgb_temp.fit(X, y)
selector = SelectFromModel(lgb_temp, prefit=True, threshold='median')
X_selected = selector.transform(X)
test_data_selected = selector.transform(test_data_processed)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.049286 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 35795
[LightGBM] [Info] Number of data points in the train set: 7635, number of used features: 5566
[LightGBM] [Info] Start training from score 0.660667




#### K-Fold Cross Validation

In [142]:

kf = KFold(n_splits=5, shuffle=True, random_state=42)
oof_predictions = np.zeros(len(y))
test_predictions = np.zeros(test_data_selected.shape[0])
for train_index, valid_index in kf.split(X_selected):
    X_train, X_valid = X_selected[train_index], X_selected[valid_index]
    y_train, y_valid = y[train_index], y[valid_index]

    # Train LightGBM model
    lgb_model = lgb.LGBMRegressor(
        random_state=42, learning_rate=0.02, n_estimators=1000, num_leaves=60,
        max_depth=-1, min_child_samples=25, subsample=0.75, colsample_bytree=0.75,
        reg_alpha=0.3, reg_lambda=0.3
    )
    lgb_model.fit(X_train, y_train, eval_set=[(X_valid, y_valid)],
                 eval_metric='rmse',
                 callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=0)])

    # Validate the model
    oof_predictions[valid_index] = lgb_model.predict(X_valid)
    test_predictions += lgb_model.predict(test_data_selected) / kf.n_splits

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.032694 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 29692
[LightGBM] [Info] Number of data points in the train set: 6108, number of used features: 3535
[LightGBM] [Info] Start training from score 0.658422
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.034260 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 29692
[LightGBM] [Info] Number of data points in the train set: 6108, number of used features: 3535
[LightGBM] [Info] Start training from score 0.661442
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.033334 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 29676
[LightGBM] [Info] Number of data points in the train set: 6108, number of used features: 3534
[LightGBM] [Info] Sta

#### Evaluate performance

In [144]:

mse = mean_squared_error(y, oof_predictions)
r2 = r2_score(y, oof_predictions)
print(f"Validation Mean Squared Error: {mse}")
print(f"Validation R2 Score: {r2}")


Validation Mean Squared Error: 0.008360897010833354
Validation R2 Score: 0.701616971211891


### Create final CSV file for submission

In [147]:
submission = pd.DataFrame({'ID': test_ids, 'matched_score': test_predictions})
submission.to_csv('submission.csv', index=False)