# Install

In [151]:
%pip install -qU pandas numpy scikit-learn lightgbm tqdm

Note: you may need to restart the kernel to use updated packages.


# Task 1

1. Counting islands. Classical Algorithms\
You have a matrix MxN that represents a map. There are 2 possible states on the map:\
1 - islands, 0 - the ocean. Your task is to calculate the number of islands in the most\
effective way. Please write code in Python 3 and provide a github repository with a\
solution.\
Take care that you clearly understand test cases before starting implementation.

Inputs:\
M N

Matrix\
Test cases:\
Input:\
3 3\
0 1 0\
0 0 0\
0 1 1

Output: 2\
Input:\
3 4\
0 0 0 1\
0 0 1 0\
0 1 0 0

Output: 3\
Input:\
3 4\
0 0 0 1\
0 0 1 1\
0 1 0 1\
Output: 2

In [13]:
def dfs(matrix, visited, row, col, directions):
    rows = len(matrix)
    cols = len(matrix[0])
    stack = [(row, col)]
    
    while stack:
        current_row, current_col = stack.pop()
        visited[current_row][current_col] = True

        for delta_row, delta_column in directions:
            r, c = current_row + delta_row, current_col + delta_column
            if 0 <= r < rows and 0 <= c < cols and not visited[r][c] and matrix[r][c] == 1:
                stack.append((r, c))

def count_islands(matrix):
    if not matrix:
        return 0

    rows = len(matrix)
    cols = len(matrix[0])
    visited = [[False for _ in range(cols)] for _ in range(rows)]
    island_count = 0
    directions = [(-1, 0), (1, 0), (0, -1), (0, 1)]

    for row in range(rows):
        for col in range(cols):
            if matrix[row][col] == 1 and not visited[row][col]:
                dfs(matrix, visited, row, col, directions)
                island_count += 1

    return island_count

In [14]:
test_cases = [
    [[0, 1, 0], 
     [0, 0, 0], 
     [0, 1, 1]],     #2
    
    [[0, 0, 0, 1], 
     [0, 0, 1, 0], 
     [0, 1, 0, 0]],  #3
    
    [[0, 0, 0, 1], 
     [0, 0, 1, 1], 
     [0, 1, 0, 1]]   #2
]

for idx, matrix in enumerate(test_cases):
    print(f"Test number {idx + 1}: {count_islands(matrix)} islands \n")


Test number 1: 2 islands 

Test number 2: 3 islands 

Test number 3: 2 islands 



# Task 2

2. Regression on the tabular data. General Machine Learning\
You have a dataset (train.csv) that contains 53 anonymized features and a target\
column. Your task is to build a model that predicts a target based on the proposed\
features. Please provide predictions for the hidden_test.csv file. Target metric is RMSE.\
The main goal is to provide github repository that contains:\
● jupyter notebook with exploratory data analysis;\
● train.py python script for model training;\
● predict.py python script for model inference on test data;

● file with prediction results;\
● readme file that contains instructions about project setup and general guidance around project;\
● requirements.txt file.\
Please provide documented code. Scripts (train.py and predict.py) should be able\
to be executed from the terminal.

In [198]:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

In [268]:
train_df = pd.read_csv("./train.csv")
test_df = pd.read_csv("./hidden_test.csv")

In [41]:
pd.set_option('display.max_columns', 55)
train_df.describe(include='all')

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,target
count,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0,90000.0
mean,249.423944,250.236267,248.637289,249.7366,249.436178,249.656167,-0.011402,0.498548,0.499189,249.842033,249.818967,249.346778,249.8999,7.48394,7.498037,7.496259,7.501635,7.486463,7.523043,7.475585,7.494835,7.503939,7.507724,7.508681,7.472421,7.519205,7.515753,7.509402,7.496781,7.504014,7.494212,7.490808,7.486936,7.514508,7.491614,7.520193,7.507801,7.493713,7.51384,7.507033,7.479579,7.519826,7.502483,7.512965,7.475155,7.523962,7.508397,7.473322,7.490658,7.474578,7.509206,7.487159,7.513316,50.033981
std,144.336393,144.0222,144.107577,144.284945,143.941581,144.329168,7.038171,0.288682,0.500002,144.612718,144.363827,144.476128,144.310495,4.330908,4.318388,4.338732,4.345618,4.318031,4.3235,4.327327,4.325447,4.330159,4.332599,4.328966,4.345228,4.339104,4.333907,4.331007,4.334982,4.318553,4.327286,4.322672,4.329524,4.329973,4.333144,4.335244,4.329515,4.32748,4.333543,4.32767,4.326175,4.315565,4.333881,4.329929,4.33041,4.321537,4.331761,4.335692,4.332122,4.323035,4.326364,4.324876,4.33308,28.897243
min,0.0,0.0,0.0,0.0,0.0,0.0,-9.949874,1.4e-05,0.0,0.0,0.0,0.0,0.0,9.5e-05,0.000252,0.00019,0.000192,6.7e-05,0.000229,0.000125,0.000382,0.000398,0.00014,0.000382,0.000129,0.000192,1.7e-05,8e-05,0.000161,0.000437,4.1e-05,7e-05,7.4e-05,0.000111,6.7e-05,7.8e-05,1.8e-05,9.4e-05,2e-05,0.000106,0.000105,0.000273,0.000315,6e-05,1.9e-05,4e-05,0.000154,8.3e-05,0.000367,1.4e-05,0.00016,0.000147,0.000125,0.002634
25%,125.0,126.0,124.0,125.0,125.0,124.0,-7.071068,0.248932,0.0,124.0,126.0,124.0,125.0,3.723392,3.763626,3.718755,3.719537,3.758746,3.778857,3.73378,3.748951,3.760261,3.74281,3.768145,3.68514,3.76079,3.746467,3.772795,3.741697,3.774239,3.743116,3.763061,3.750034,3.778028,3.743842,3.752848,3.776405,3.755325,3.754465,3.771945,3.731527,3.784809,3.763004,3.768044,3.707544,3.797002,3.760627,3.715721,3.739358,3.715298,3.773381,3.743536,3.776322,25.091903
50%,250.0,251.0,248.0,250.0,250.0,250.0,0.0,0.497136,0.0,250.0,250.0,249.0,251.0,7.483265,7.505267,7.502682,7.51524,7.467721,7.539462,7.460267,7.4815,7.521563,7.520793,7.520739,7.484349,7.526894,7.528984,7.494876,7.491009,7.500083,7.514475,7.483334,7.458614,7.513794,7.475324,7.550623,7.484699,7.472789,7.518614,7.504101,7.452493,7.532363,7.511806,7.507058,7.474127,7.533987,7.505259,7.459774,7.494167,7.47727,7.512575,7.476564,7.506812,50.030705
75%,374.0,375.0,374.0,375.0,373.0,374.0,7.0,0.747513,1.0,376.0,375.0,375.0,375.0,11.242883,11.228331,11.258049,11.284618,11.238904,11.254751,11.211187,11.238128,11.248749,11.244657,11.240915,11.239795,11.276738,11.272683,11.271486,11.257988,11.233189,11.223296,11.229486,11.245719,11.272623,11.241922,11.261178,11.255442,11.251097,11.270277,11.241504,11.222543,11.241135,11.255483,11.272459,11.216585,11.276349,11.261971,11.215637,11.239232,11.21007,11.268156,11.234414,11.277835,75.059454
max,499.0,499.0,499.0,499.0,499.0,499.0,9.949874,0.999987,1.0,499.0,499.0,499.0,499.0,14.99992,14.999938,14.99966,14.999979,14.99984,14.999857,14.999634,14.999953,14.999483,14.999762,14.999875,14.999987,14.999651,14.999995,14.999838,14.999958,14.999841,14.999808,14.999927,14.999896,14.999871,14.999558,14.999898,14.999997,14.999822,14.999958,14.999933,14.999704,14.99994,14.999739,14.999605,14.9999,14.999528,14.999733,14.999478,14.999869,14.999928,14.999948,14.999364,14.999775,99.999482


In [189]:
corr_matrix_train_df = train_df.corr()
corr_values = corr_matrix_train_df['target'].abs().sort_values(ascending=False)
corr_values

target    1.000000
7         0.012103
40        0.007216
26        0.006370
39        0.006283
35        0.006267
1         0.005545
50        0.005243
31        0.005077
29        0.004785
51        0.004507
25        0.004276
38        0.003845
41        0.003770
28        0.003451
10        0.003396
44        0.003092
17        0.002953
42        0.002876
21        0.002874
3         0.002807
36        0.002624
49        0.002473
0         0.002427
33        0.002301
5         0.002125
12        0.002016
15        0.001653
37        0.001642
11        0.001566
19        0.001520
9         0.001504
27        0.001459
34        0.001331
4         0.001300
23        0.001280
52        0.001203
18        0.001197
48        0.001183
46        0.001160
24        0.001070
13        0.001009
2         0.001003
30        0.000964
45        0.000903
16        0.000853
6         0.000666
32        0.000561
14        0.000513
43        0.000489
20        0.000456
8         0.000347
47        0.

In [267]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, Lasso
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, PolynomialFeatures
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.decomposition import PCA
from math import sqrt
import lightgbm as lgb
from scipy.stats import randint as sp_randint, uniform as sp_uniform


# Pipeline
class CorrelationFilter(BaseEstimator, TransformerMixin):
    def __init__(self, threshold):
        self.threshold = threshold
        self.selected_columns = None
    
    def fit(self, X, y):
        correlations = np.abs([np.corrcoef(X[:, i], y)[0, 1] for i in range(X.shape[1])])
        self.selected_columns = np.where(correlations >= self.threshold)[0]
        return self
    
    def transform(self, X):
        return X[:, self.selected_columns]

X = train_df.drop(columns=['target'])
y = train_df['target']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

normalize_columns = [7, 8]
standardize_columns = [int(x) for x in list(set(np.arange(0, len(X_train.columns)))\
                                            .difference(set(normalize_columns)))]
preprocessor = ColumnTransformer(
    transformers=[
        ('standardize', StandardScaler(), standardize_columns),
        ('normalize', MinMaxScaler(), normalize_columns)
    ],
    remainder='passthrough'
)

poly_features = PolynomialFeatures(degree=2, include_bias=False)
pca = PCA(n_components=0.95)


# LightGBM
pipeline_lgb = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('polynomial', poly_features),
    ('correlation_filter', CorrelationFilter(threshold=0.003)),
    ('pca', pca),
    ('regressor', lgb.LGBMRegressor(random_state=42, verbosity=-1))
])

lgb_params = {
    'regressor__n_estimators': sp_randint(250, 350),
    'regressor__max_depth': sp_randint(10, 25),
    'regressor__learning_rate': sp_uniform(0.07, 0.2),
    'regressor__num_leaves': sp_randint(10, 30),
    'regressor__min_child_samples': sp_randint(25, 35),
    'regressor__bagging_fraction': sp_uniform(0.1, 0.9),
    'regressor__colsample_bytree': sp_uniform(0.6, 0.4)
}


lgb_random_search = RandomizedSearchCV(pipeline_lgb, 
                                       lgb_params, 
                                       n_iter=10, 
                                       cv=5,
                                       scoring='neg_mean_squared_error', 
                                       n_jobs=10, 
                                       random_state=42,
                                       verbose=0)

lgb_random_search.fit(X_train, y_train)
best_lgb = lgb_random_search.best_estimator_
y_train_pred_lgb = best_lgb.predict(X_train)
y_val_pred_lgb = best_lgb.predict(X_val)
rmse_train_lgb = sqrt(mean_squared_error(y_train, y_train_pred_lgb))
rmse_val_lgb = sqrt(mean_squared_error(y_val, y_val_pred_lgb))
print(f'Best params LightGBM: {lgb_random_search.best_params_}')
print(f'RMSE train LightGBM: {rmse_train_lgb}')
print(f'RMSE val LightGBM: {rmse_val_lgb}')



# {
#     'regressor__bagging_fraction': np.float64(0.8739463660626885), 
#     'regressor__colsample_bytree': np.float64(0.8721230154351118), 
#     'regressor__learning_rate': np.float64(0.16009985039390862), 
#     'regressor__max_depth': 11, 
#     'regressor__min_child_samples': 28, 
#     'regressor__n_estimators': 338, '
#     regressor__num_leaves': 23
# # }
# RMSE train LightGBM: 16.453558818693924
# RMSE val LightGBM: 21.04754117927409


# Lasso
pipeline_lasso = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('polynomial', poly_features),
    ('correlation_filter', CorrelationFilter(threshold=0.003)),
    ('pca', pca),
    ('regressor', Lasso())
])

lasso_params = {
    'regressor__alpha': [0.05, 0.1, 0.2, 0.3]
}

lasso_grid_search = GridSearchCV(pipeline_lasso, lasso_params, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
lasso_grid_search.fit(X_train, y_train)

best_lasso = lasso_grid_search.best_estimator_
y_train_pred_lasso = best_lasso.predict(X_train)
y_val_pred_lasso = best_lasso.predict(X_val)
rmse_train_lasso = sqrt(mean_squared_error(y_train, y_train_pred_lasso))
rmse_val_lasso = sqrt(mean_squared_error(y_val, y_val_pred_lasso))
print(f'Best params Lasso: {lasso_grid_search.best_params_}')
print(f'RMSE train Lasso: {rmse_train_lasso}')
print(f'RMSE val Lasso: {rmse_val_lasso}')


# Ridge
pipeline_ridge = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('polynomial', poly_features),
    ('correlation_filter', CorrelationFilter(threshold=0.003)),
    ('pca', pca),
    ('regressor', Ridge())
])

ridge_params = {
    'regressor__alpha': [50, 100, 150, 200]
}

ridge_grid_search = GridSearchCV(pipeline_ridge, ridge_params, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
ridge_grid_search.fit(X_train, y_train)

best_ridge = ridge_grid_search.best_estimator_
y_train_pred_ridge = best_ridge.predict(X_train)
y_val_pred_ridge = best_ridge.predict(X_val)
rmse_train_ridge = sqrt(mean_squared_error(y_train, y_train_pred_ridge))
rmse_val_ridge = sqrt(mean_squared_error(y_val, y_val_pred_ridge))
print(f'Best params Ridge: {ridge_grid_search.best_params_}')
print(f'RMSE train Ridge: {rmse_train_ridge}')
print(f'RMSE val Ridge: {rmse_val_ridge}')

Search Progress:  35%|███████████████████████████████████████▏                                                                        | 7/20 [28:02:15<52:04:12, 14419.41s/it]
Search Progress:  20%|██████████████████████▍                                                                                         | 4/20 [20:26:53<81:47:32, 18403.27s/it]


Best params LightGBM: {'regressor__bagging_fraction': np.float64(0.8739463660626885), 'regressor__colsample_bytree': np.float64(0.8721230154351118), 'regressor__learning_rate': np.float64(0.16009985039390862), 'regressor__max_depth': 11, 'regressor__min_child_samples': 28, 'regressor__n_estimators': 338, 'regressor__num_leaves': 23}
RMSE train LightGBM: 16.453558818693924
RMSE val LightGBM: 21.04754117927409
Best params Lasso: {'regressor__alpha': 0.2}
RMSE train Lasso: 28.516372554777824
RMSE val Lasso: 28.94340572465319
Best params Ridge: {'regressor__alpha': 200}
RMSE train Ridge: 28.29347845766259
RMSE val Ridge: 29.12257179083495
