In [1]:
import pandas as pd
import numpy as np
import base64
import cv2
import catboost
from catboost import CatBoostRegressor
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import BayesianRidge
from tqdm import tqdm
import random
from sklearn.model_selection import KFold
import base64
import pickle
tqdm.pandas()

import base64
import warnings

import numpy as np
import pandas as pd

from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split

from PIL import Image
from io import BytesIO
import base64
from sklearn.metrics.pairwise import cosine_similarity

warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
def set_seed(seed=42):
    np.random.seed(seed)
    random.seed(seed)

set_seed()

In [3]:
def decode_image(base64_str):
    try:
        img_data = base64.b64decode(base64_str)
        np_arr = np.frombuffer(img_data, np.uint8)
        img = cv2.imdecode(np_arr, cv2.IMREAD_COLOR)
        if img is not None:
            return img
    except:
        pass
    return np.zeros((100, 100, 3), dtype=np.uint8)

In [23]:
from skimage.feature import graycomatrix, graycoprops
import string

# def extract_image_features(df):
#     img_sizes = []
#     img_means = []
#     img_stds = []
#     # img_min = []
#     # img_max = []
#     for img_b64 in tqdm(df['photo'].fillna("")):
#         img = decode_image(img_b64)
#         img_sizes.append(img.shape[0] * img.shape[1])
#         img_means.append(img.mean())
#         img_stds.append(img.std())
#         # img_min.append(img.min())
#         # img_max.append(img.max())
#     df['img_size'] = img_sizes
#     df['img_mean'] = img_means
#     df['img_std'] = img_stds
#     # df['img_min'] = img_min
#     # df['img_max'] = img_max
#     df.drop(columns=['photo'], inplace=True)
#     return df

def extract_image_features(df):
    img_sizes = []
    img_means = []
    img_stds = []
    img_mins = []
    img_maxs = []
    img_entropies = []
    
    glcm_contrasts = []
    glcm_dissimilarities = []
    glcm_homogeneities = []
    glcm_energies = []
    glcm_correlations = []
    glcm_ASMs = []
    
    for img_b64 in tqdm(df['photo'].fillna("")):
        img = decode_image(img_b64)
        
        # If image is color, convert to grayscale for GLCM
        if len(img.shape) == 3 and img.shape[2] == 3:
            # Convert to grayscale if needed (example with np.dot or cv2):
            # img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
            img_gray = np.dot(img[...,:3], [0.2989, 0.5870, 0.1140]).astype(np.uint8)
        else:
            # Already grayscale
            img_gray = img.astype(np.uint8)
        
        # 1. Basic pixel intensity features
        img_sizes.append(img.shape[0] * img.shape[1])
        img_means.append(img.mean())
        img_stds.append(img.std())
        img_mins.append(img.min())
        img_maxs.append(img.max())
        
        # 2. Entropy (classic definition using histogram)
        hist, _ = np.histogram(img.ravel(), bins=256, range=(0, 256))
        hist = hist[hist > 0]
        probabilities = hist / hist.sum()
        entropy = -np.sum(probabilities * np.log2(probabilities))
        img_entropies.append(entropy)
        
        # 3. GLCM-based texture features (using angles 0, 45, 90, 135 degrees)
        glcm = graycomatrix(
            img_gray, 
            distances=[1], 
            angles=[0, np.pi/4, np.pi/2, 3*np.pi/4],
            levels=256, 
            symmetric=True, 
            normed=True
        )
        
        contrast = graycoprops(glcm, 'contrast').mean()
        dissimilarity = graycoprops(glcm, 'dissimilarity').mean()
        homogeneity = graycoprops(glcm, 'homogeneity').mean()
        energy = graycoprops(glcm, 'energy').mean()
        correlation = graycoprops(glcm, 'correlation').mean()
        ASM = graycoprops(glcm, 'ASM').mean()
        
        glcm_contrasts.append(contrast)
        glcm_dissimilarities.append(dissimilarity)
        glcm_homogeneities.append(homogeneity)
        glcm_energies.append(energy)
        glcm_correlations.append(correlation)
        glcm_ASMs.append(ASM)
    
    df['img_size'] = img_sizes
    df['brightness'] = img_means
    df['contrast_std'] = img_stds
    df['img_min'] = img_mins
    df['img_max'] = img_maxs
    df['contrast_range'] = df['img_max'] - df['img_min']
    df['coefficient_of_variation'] = df['contrast_std'] / (df['brightness'] + 1e-9)
    df['entropy'] = img_entropies
    
    # Add GLCM-based texture features
    df['glcm_contrast'] = glcm_contrasts
    df['glcm_dissimilarity'] = glcm_dissimilarities
    df['glcm_homogeneity'] = glcm_homogeneities
    df['glcm_energy'] = glcm_energies
    df['glcm_correlation'] = glcm_correlations
    df['glcm_asm'] = glcm_ASMs
    
    # Drop the raw 'photo' after feature extraction
    df.drop(columns=['photo'], inplace=True)
    
    return df

In [24]:
import string
import re

def extract_text_features(df):
    df['text'] = df['text'].fillna('')
    df['text_length'] = df['text'].apply(len)
    df['word_count'] = df['text'].apply(lambda x: len(x.split()))
    
    df['question_marks'] = df['text'].apply(lambda x: x.count('?'))
    df['exclamation_marks'] = df['text'].apply(lambda x: x.count('!'))
    df['periods'] = df['text'].apply(lambda x: x.count('.'))
    df['commas'] = df['text'].apply(lambda x: x.count(','))
    
    df['digits_count'] = df['text'].apply(lambda x: sum(c.isdigit() for c in x))
    df['uppercase_letters'] = df['text'].apply(lambda x: sum(c.isupper() for c in x))
    df['uppercase_words'] = df['text'].apply(lambda x: len(re.findall(r'\b[A-Z]+\b', x)))
    df['punctuation_count'] = df['text'].apply(lambda x: sum(1 for c in x if c in string.punctuation))
    df['hashtag_count'] = df['text'].apply(lambda x: x.count('#'))
    df['mention_count'] = df['text'].apply(lambda x: x.count('@'))
    
    df['unique_words_ratio'] = df.apply(
        lambda row: len(set(row['text'].split())) / row['word_count'] if row['word_count'] > 0 else 0, 
        axis=1
    )
    df['longest_word_length'] = df['text'].apply(
        lambda x: max(len(word) for word in x.split()) if len(x.split()) > 0 else 0
    )
    
    tfidf = TfidfVectorizer(max_features=1000)
    tfidf_matrix = tfidf.fit_transform(df['text'])
    with open('gpt1/tfidf.pickle', 'wb') as f:
        pickle.dump(tfidf, f)
    
    svd = TruncatedSVD(n_components=50)
    text_features = svd.fit_transform(tfidf_matrix)
    with open('gpt1/svd.pickle', 'wb') as f:
        pickle.dump(svd, f)
    
    text_feature_df = pd.DataFrame(text_features, columns=[f'text_svd_{i}' for i in range(50)])
    df = df.reset_index(drop=True)
    df = pd.concat([df, text_feature_df], axis=1)
    df.drop(columns=['text'], inplace=True)
    
    return df

In [7]:
def preprocess_data(df):
    df = extract_image_features(df)
    df = extract_text_features(df)
    return df

In [8]:
def img_vectorizer(photo_base64):
    img = np.array(Image.open(BytesIO(base64.b64decode(photo_base64))))
    s = img.shape
    if len(s) == 2:
        # Если изображение в градациях серого, повторяем по каналам
        img = np.repeat(img[..., np.newaxis], 3, axis=2)
    h, w = img.shape[0], img.shape[1]
    # Приводим изображение к виду (h*w, 3)
    img = img.reshape(-1, 3)
    stats = []
    stats.append(np.array([h, w]))
    stats.append(img.min(axis=0))
    stats.append(img.max(axis=0))
    stats.append(img.mean(axis=0))
    stats.append(img.std(axis=0))
    stats.append(np.median(img, axis=0))
    # Корреляционная матрица – верхний треугольник (без диагонали)
    cm = np.corrcoef(img.T)
    stats.append(cm[np.triu_indices(len(cm), k=1)])
    return np.concatenate(stats)

In [9]:
data = pd.read_csv('data/train.csv')
train = data.copy()
data = preprocess_data(data)

100%|██████████| 20000/20000 [07:17<00:00, 45.73it/s]


In [10]:
train["text"] = train["text"].fillna("")

In [11]:
train["img_features"] = train["photo"].progress_apply(lambda x: img_vectorizer(x))
X_img = np.vstack(train["img_features"].values)

100%|██████████| 20000/20000 [01:28<00:00, 225.12it/s]


In [25]:
def extract_image_features(df):
    img_sizes = []
    img_means = []
    img_stds = []
    img_mins = []
    img_maxs = []
    img_entropies = []
    
    for img_b64 in tqdm(df['photo'].fillna("")):
        img = decode_image(img_b64)
        
        img_sizes.append(img.shape[0] * img.shape[1])
        img_means.append(img.mean())
        img_stds.append(img.std())
        img_mins.append(img.min())
        img_maxs.append(img.max())
        
        hist, _ = np.histogram(img.ravel(), bins=256, range=(0, 256))
        hist = hist[hist > 0]
        probabilities = hist / hist.sum()
        entropy = -np.sum(probabilities * np.log2(probabilities))
        img_entropies.append(entropy)

    df['img_size'] = img_sizes
    df['brightness'] = img_means 
    df['contrast_std'] = img_stds 
    df['img_min'] = img_mins
    df['img_max'] = img_maxs
    
    df['contrast_range'] = df['img_max'] - df['img_min']  
    df['coefficient_of_variation'] = df['contrast_std'] / (df['brightness'] + 1e-9)  
    df['entropy'] = img_entropies
    
    df.drop(columns=['photo'], inplace=True)
    return df

In [13]:
train = extract_image_features(train)

100%|██████████| 20000/20000 [00:32<00:00, 619.94it/s]


In [14]:
avg_img_vector = np.mean(X_img, axis=0).reshape(1, -1)
cos_sim = cosine_similarity(X_img, avg_img_vector).flatten()
train["cosine_sim"] = cos_sim

In [15]:
X_numeric = pd.DataFrame(X_img)
X_numeric["cosine_sim"] = train["cosine_sim"]

In [16]:
X_train_all = X_numeric.copy()
X_train_all["text"] = train["text"]

In [17]:
targets = ['like', 'comment', 'hide', 'expand', 'open_photo', 'open', 'share_to_message']
features = [col for col in data.columns if col not in targets]
views = data.view

In [18]:
all_data = pd.concat([data[features], X_train_all], axis=1).rename(columns=dict(zip(list(range(20)), list(map(str, list(range(20)))))))

In [19]:
X_train, X_valid, y_train, y_valid, view_train, view_valid = train_test_split(all_data, data[targets], views, test_size=0.2, random_state=42, shuffle=True)

In [26]:
models = {}
score_mean = 0
params = {
    'like': {'iterations': 1200, 'verbose': 0, 'eval_metric': 'R2'},
    'comment': {'iterations': 700, 'verbose': 0, 'eval_metric': 'R2'},
    'hide': {'iterations': 700, 'verbose': 0, 'eval_metric': 'R2'},
    'expand': {'iterations': 900, 'verbose': 0, 'eval_metric': 'R2'},
    'open_photo': {'iterations': 500, 'verbose': 0, 'eval_metric': 'R2'},
    'open': {'iterations': 800, 'verbose': 0, 'eval_metric': 'R2'},
    'share_to_message': {'iterations': 1000, 'verbose': 0, 'eval_metric': 'R2'},
}

for col in tqdm(targets):
    if col == 'comment' or col == 'hide':
        model = KNeighborsRegressor(200)
    elif col == 'open_photo':
        model = KNeighborsRegressor(80)
    else:
        model = CatBoostRegressor(**params[col], random_seed=42, text_features=['text'])
    
    if col == 'open_photo':
        model.fit(X_train.drop(columns=['text']), y_train[col])
        score = r2_score(y_valid[col], model.predict(X_valid.drop(columns=['text'])))
    else:
        if col in ['comment', 'hide']:
            model.fit(X_train.drop(columns=['text']), y_train[col] / view_train)
            score = r2_score(y_valid[col] / view_valid, model.predict(X_valid.drop(columns=['text'])))
        else:
            model.fit(X_train, y_train[col] / view_train)
            score = r2_score(y_valid[col] / view_valid, model.predict(X_valid))
    score_mean += score
    print(col, score)
    models[col] = model

print(score_mean / len(targets))

 14%|█▍        | 1/7 [01:16<07:38, 76.39s/it]

like 0.1876314150825914


 29%|██▊       | 2/7 [01:16<02:38, 31.70s/it]

comment -0.0017814723261069254


 43%|████▎     | 3/7 [01:17<01:09, 17.48s/it]

hide -0.00126447323057155


 57%|█████▋    | 4/7 [02:16<01:41, 33.84s/it]

expand 0.5267200248799238


 71%|███████▏  | 5/7 [02:16<00:43, 21.71s/it]

open_photo 0.2605927673461057


 86%|████████▌ | 6/7 [03:08<00:32, 32.18s/it]

open 0.44224149082279773


100%|██████████| 7/7 [04:18<00:00, 36.86s/it]

share_to_message 0.12555175513183536
0.2199559296723679





In [None]:
with open('models.pickle', 'wb') as f:
    pickle.dump(models, f)

In [21]:
from sklearn.model_selection import KFold
from sklearn.neighbors import KNeighborsRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score
import numpy as np
from tqdm import tqdm

models = {}
params = {
    'like': {'iterations': 1200, 'verbose': 0, 'eval_metric': 'R2'},
    'comment': {'iterations': 700, 'verbose': 0, 'eval_metric': 'R2'},
    'hide': {'iterations': 700, 'verbose': 0, 'eval_metric': 'R2'},
    'expand': {'iterations': 900, 'verbose': 0, 'eval_metric': 'R2'},
    'open_photo': {'iterations': 500, 'verbose': 0, 'eval_metric': 'R2'},
    'open': {'iterations': 800, 'verbose': 0, 'eval_metric': 'R2'},
    'share_to_message': {'iterations': 1000, 'verbose': 0, 'eval_metric': 'R2'},
}

kf = KFold(n_splits=5, shuffle=True, random_state=42)
score_mean = 0
fold_scores = []

for col in tqdm(targets):
    fold_r2_scores = []
    for train_idx, valid_idx in tqdm(kf.split(all_data)):
        X_tr, X_val = all_data.iloc[train_idx], all_data.iloc[valid_idx]
        y_tr, y_val = data[targets][col].iloc[train_idx], data[targets][col].iloc[valid_idx]
        view_tr, view_val = views.iloc[train_idx], views.iloc[valid_idx]

        if col == 'comment' or col == 'hide':
            model = KNeighborsRegressor(200)
        elif col == 'open_photo':
            model = KNeighborsRegressor(80)
        else:
            model = CatBoostRegressor(**params[col], random_seed=42, text_features=['text'])

        if col == 'open_photo':
            model.fit(X_tr.drop(columns=['text']), y_tr)
            score = r2_score(y_val, model.predict(X_val.drop(columns=['text'])))
            fold_r2_scores.append(score)
        else:
            if col in ['comment', 'hide']:
                model.fit(X_tr.drop(columns=['text']), y_tr / view_tr)
                score = r2_score(y_val / view_val, model.predict(X_val.drop(columns=['text'])))
                fold_r2_scores.append(score)
            else:
                model.fit(X_tr, y_tr / view_tr)
                score = r2_score(y_val / view_val, model.predict(X_val))
                fold_r2_scores.append(score)

    mean_r2 = np.mean(fold_r2_scores)
    fold_scores.append(mean_r2)
    models[col] = model
    print(f"{col}: mean R2 = {mean_r2:.5f}")

score_mean = np.mean(fold_scores)
print(f"Overall mean R2: {score_mean:.5f}")

5it [06:59, 83.84s/it]00:00<?, ?it/s]
 14%|█▍        | 1/7 [06:59<41:55, 419.23s/it]

like: mean R2 = 0.18450


5it [00:02,  2.46it/s]
 29%|██▊       | 2/7 [07:01<14:29, 173.82s/it]

comment: mean R2 = -0.00065


5it [00:02,  2.37it/s]
 43%|████▎     | 3/7 [07:03<06:21, 95.42s/it] 

hide: mean R2 = 0.00345


5it [05:07, 61.55s/it]
 57%|█████▋    | 4/7 [12:11<08:57, 179.25s/it]

expand: mean R2 = 0.53345


5it [00:01,  4.34it/s]
 71%|███████▏  | 5/7 [12:12<03:50, 115.02s/it]

open_photo: mean R2 = 0.29057


5it [04:26, 53.38s/it]
 86%|████████▌ | 6/7 [16:39<02:46, 166.67s/it]

open: mean R2 = 0.45585


5it [05:35, 67.19s/it]
100%|██████████| 7/7 [22:15<00:00, 190.74s/it]

share_to_message: mean R2 = 0.11278
Overall mean R2: 0.22571





In [None]:
models = {}
params = {
    'like': {'iterations': 1200, 'verbose': 0, 'eval_metric': 'R2'},
    'comment': {'iterations': 700, 'verbose': 0, 'eval_metric': 'R2'},
    'hide': {'iterations': 700, 'verbose': 0, 'eval_metric': 'R2'},
    'expand': {'iterations': 900, 'verbose': 0, 'eval_metric': 'R2'},
    'open_photo': {'iterations': 500, 'verbose': 0, 'eval_metric': 'R2'},
    'open': {'iterations': 800, 'verbose': 0, 'eval_metric': 'R2'},
    'share_to_message': {'iterations': 1000, 'verbose': 0, 'eval_metric': 'R2'},
}

for col in tqdm(targets):
    if col == 'comment' or col == 'hide':
        model = KNeighborsRegressor(200)
    elif col == 'open_photo':
        model = KNeighborsRegressor(80)
    else:
        model = CatBoostRegressor(**params[col], random_seed=42, text_features=['text'], max_depth=9)
    
    if col == 'open_photo':
        model.fit(all_data.drop(columns=['text']), data[col])
    else:
        if col in ['comment', 'hide']:
            model.fit(all_data.drop(columns=['text']), data[col] / views)
        else:
            model.fit(all_data, data[col] / views)
    models[col] = model

  0%|          | 0/7 [00:00<?, ?it/s]

In [None]:
with open('gpt1/models.pickle', 'wb') as f:
    pickle.dump(models, f)