In [1]:
%%javascript
// Шоткат для перемещения к исполняемой
Jupyter.keyboard_manager.command_shortcuts.add_shortcut('Alt-I', {
    help : 'Go to Running cell',
    help_index : 'zz',
    handler : function (event) {
        setTimeout(function() {
            // Find running cell and click the first one
            if ($('.running').length > 0) {
                //alert("found running cell");
                $('.running')[0].scrollIntoView();
            }}, 250);
        return false;
    }
});

<IPython.core.display.Javascript object>

In [2]:
import sys
import os
import warnings
os.environ['OPENBLAS_NUM_THREADS'] = '1'
os.environ['DASK_TEMP_DIR'] = 'Z:/temp_data'
warnings.filterwarnings('ignore')

In [3]:
# импортируем библиотеку pandarallel и инициализируем её
from pandarallel import pandarallel
pandarallel.initialize(progress_bar=True, nb_workers=6)

INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.

https://nalepae.github.io/pandarallel/troubleshooting/


In [4]:
import tqdm
import pickle 
import random
import seaborn as sns
import matplotlib.pyplot as plt
from fuzzywuzzy import fuzz
import idna

In [5]:
import pandas as pd
import numpy as np
import time
import pyarrow.parquet as pq
import pyarrow.feather as pf
import pyarrow as pa
import scipy
import implicit
import bisect
import sklearn.metrics as m
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [6]:
OBJ_COLS = [
            'region_name', 
            'city_name', 
            'cpe_manufacturer_name',
            'cpe_model_name',
            'url_host',
            'cpe_type_cd',
            'cpe_model_os_type', 
#             'price', 
#             'date',
            'part_of_day', 
#             'request_cnt',
#             'user_id'
           ]

DIGIT_COLS = [
            'price', 
            'date',
            'request_cnt',
            'user_id'
            ]

In [7]:
def fix_urls(url):
    import idna
    if '.turbopages.org' in url:
            url = url.replace('.turbopages.org', '').replace('--', '_').replace('-', '.')
    try:
        url = idna.decode(url)
        
    except idna.IDNAError:
        
        pass
        
    return url

In [8]:
targets = pd.read_parquet('E:/mts/public_train.pqt', engine='pyarrow')
id_to_submit = pd.read_parquet('E:/mts/submit_2.pqt')

with open ('E:/mts/unique_values_dict.pkl', 'rb') as f:
    unique_obj_dict = pickle.load(f)  
# unique_obj_dict['url_host'] = {fix_urls(key): value for key, value in unique_obj_dict['url_host'].items()}

In [9]:
# value_to_find = 30

# # Поиск ключа, соответствующего заданному значению
# for key, value in unique_obj_dict['url_host'].items():
#     if value == value_to_find:
#         print(f'Key found: {key}')
#         break
# else:
#     print('Key not found')

In [10]:
def reduce_mem_usage(df):
    
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    for col in df.columns:    
        
        
        col_type = df[col].dtype.name

        if col_type not in ['object', 'category', 'datetime64[ns]']:
                
                
                
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                    
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                        
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df #= reduce_mem_usage(df)



def get_maped_df():
    
    df_result = pd.DataFrame([])

    for col in OBJ_COLS:
        print(col)
        df_result[col] = df[col].map(unique_obj_dict[col])
        
    for col in DIGIT_COLS:
        print(col)
        df_result[col] = df[col]
        
    return reduce_mem_usage(df_result)


# def match_region(x):
#     from fuzzywuzzy import fuzz
#     import pickle 

#     with open ('E:/mts/unique_values_dict.pkl', 'rb') as f:
#         unique_obj_dict = pickle.load(f)  

#     max_ratio = 0
#     max_reg = x
#     for region in unique_obj_dict['region_name'].keys():
#         ratio = fuzz.ratio(x, region)
#         if ratio > max_ratio:
#             max_ratio = ratio
#             max_reg = region
#     return unique_obj_dict['region_name'][max_reg]




def round_by(num, base=1):
    temp = num // base * base
    return temp + base

def age_bucket(x):
    return bisect.bisect_left([18,25,35,45,55,65], x, lo=1)

In [11]:
def get_region_stat():
    
    
    dict_for_reg = {
        'Адыгея': 'Республика Адыгея',
         'Алтайский край': 'Алтайский край',
         'Амурская область': 'Амурская область',
         'Архангельская область': 'Архангельская область',
         'Астраханская область': 'Астраханская область',
         'Башкортостан': 'Республика Башкортостан',
         'Белгородская область': 'Белгородская область',
         'Брянская область': 'Брянская область',
         'Бурятия': None,
         'Владимирская область': 'Владимирская область',
         'Волгоградская область': 'Волгоградская область',
         'Вологодская область': 'Вологодская область',
         'Воронежская область': 'Воронежская область',
         'Дагестан': 'Республика Дагестан',
         'Еврейская автономная область': 'Еврейская АО',
         'Забайкальский край': None,
         'Ивановская область': 'Ивановская область',
         'Ингушетия': 'Республика Ингушетия',
         'Иркутская область': 'Иркутская область',
         'Кабардино-Балкария': 'Кабардино-Балкарская Республика',
         'Калининградская область': 'Калининградская область',
         'Калмыкия': 'Республика Калмыкия',
         'Калужская область': 'Калужская область',
         'Камчатский край': 'Камчатский край',
         'Карачаево-Черкесия': 'Карачаево-Черкесская Республика',
         'Карелия': 'Республика Карелия',
         'Кемеровская область': 'Кемеровская область - Кузбасс',
         'Кировская область': 'Кировская область',
         'Костромская область': 'Костромская область',
         'Краснодарский край': 'Краснодарский край',
         'Красноярский край': 'Красноярский край',
         'Крым': None,
         'Курганская область': 'Курганская область',
         'Курская область': 'Курская область',
         'Ленинградская область': 'Ленинградская область',
         'Липецкая область': 'Липецкая область',
         'Магаданская область': 'Магаданская область',
         'Марий Эл': 'Республика Марий Эл',
         'Мордовия': 'Республика Мордовия',
         'Москва': 'Москва',
         'Московская область': 'Московская область',
         'Мурманская область': 'Мурманская область',
         'Ненецкий автономный округ': 'Ненецкий АО',
         'Нижегородская область': 'Нижегородская область',
         'Новгородская область': 'Новгородская область',
         'Новосибирская область': 'Новосибирская область',
         'Омская область': 'Омская область',
         'Оренбургская область': 'Оренбургская область',
         'Орловская область': 'Орловская область',
         'Пензенская область': 'Пензенская область',
         'Пермский край': 'Пермский край',
         'Приморский край': 'Приморский край',
         'Псковская область': 'Псковская область',
         'Республика Алтай': 'Республика Алтай',
         'Республика Коми': 'Республика Коми',
         'Республика Тыва': 'Республика Тыва',
         'Ростовская область': 'Ростовская область',
         'Рязанская область': 'Рязанская область',
         'Самарская область': 'Самарская область',
         'Санкт-Петербург': 'Санкт-Петербург',
         'Саратовская область': 'Саратовская область',
         'Сахалинская область': 'Сахалинская область',
         'Свердловская область': 'Свердловская область',
         'Севастополь': None,
         'Северная Осетия': 'Республика Северная Осетия — Алания',
         'Смоленская область': 'Смоленская область',
         'Ставропольский край': 'Ставропольский край',
         'Тамбовская область': 'Тамбовская область',
         'Татарстан': 'Республика Татарстан',
         'Тверская область': 'Тверская область',
         'Томская область': 'Томская область',
         'Тульская область': 'Тульская область',
         'Тюменская область': 'Тюменская область',
         'Удмуртия': 'Удмуртская Республика',
         'Ульяновская область': 'Ульяновская область',
         'Хабаровский край': 'Хабаровский край',
         'Хакасия': 'Республика Хакасия',
         'Ханты-Мансийский автономный округ': 'Ханты-Мансийский АО — Югра',
         'Челябинская область': 'Челябинская область',
         'Чечня': 'Чеченская Республика',
         'Чувашия': 'Чувашская Республика',
         'Чукотский автономный округ': 'Чукотский АО',
         'Якутия': 'Республика Саха (Якутия)',
         'Ямало-Ненецкий автономный округ': 'Ямало-Ненецкий АО',
         'Ярославская область': 'Ярославская область'
                   }
    
    
    
    with open ('E:/mts/unique_values_dict.pkl', 'rb') as f:
        unique_obj_dict = pickle.load(f)  
 
    
    olds = pd.read_csv('E:/mts/side_data/Доля_пожилых.csv', header=None, names=['region_name', 'olds_in_region']).loc[2017]
    olds['region_name'] = olds.region_name.map(dict_for_reg).map(unique_obj_dict['region_name'])
    olds = olds.reset_index(drop=True)

    mens_in_region = pd.read_csv('E:/mts/side_data/Процент_мужчин.csv', header=None, names=['region_name', 'mens_in_region'])
    mens_in_region['region_name'] = mens_in_region.region_name.map(dict_for_reg).map(unique_obj_dict['region_name'])
    mens_in_region = mens_in_region.reset_index(drop=True)

    mean_age = pd.read_csv('E:/mts/side_data/Средний_возраст.csv', header=None, names=['region_name', 'mean_age_in_region'])
    mean_age['region_name'] = mean_age.region_name.map(dict_for_reg).map(unique_obj_dict['region_name'])
    mean_age = mean_age.reset_index(drop=True)

    population_region = pd.read_csv('E:/mts/side_data/Численность_населения.csv', header=None, names=['region_name', 'population_in_region']).loc[2019]
    population_region['region_name'] = population_region.region_name.map(dict_for_reg).map(unique_obj_dict['region_name'])
    population_region = population_region.reset_index(drop=True)




    region_statistic = pd.concat(
        [
         olds,
         mens_in_region['mens_in_region'],
         mean_age['mean_age_in_region'], 
         population_region['population_in_region']
        ], 
        axis=1)    

    region_statistic['olds_in_region'] = round((region_statistic['olds_in_region'] / 100) * region_statistic['population_in_region']).astype('int')
    region_statistic['mens_in_region'] = round((region_statistic['mens_in_region'] / 100) * region_statistic['population_in_region']).astype('int')
    region_statistic['mean_age_in_region'] = round(region_statistic['mean_age_in_region']).astype('int')
    region_statistic = region_statistic.drop('population_in_region', axis=1)

    scaler = StandardScaler()
    region_statistic[['olds_in_region', 'mens_in_region', 'mean_age_in_region']] = scaler.fit_transform(region_statistic[['olds_in_region', 'mens_in_region', 'mean_age_in_region']])


    return region_statistic

In [12]:
# with open ('E:/mts/unique_values_dict.pkl', 'rb') as f:
#         unique_obj_dict = pickle.load(f)  
# a = get_region_stat()

In [13]:
SPLIT_SEED = 555

In [14]:
%%script false

DIRECTORY = "E:/mts/competition_data_final_pqt"

print('Кладем паркет')
files = [DIRECTORY + '/' + os.path.join(f) for f in os.listdir(DIRECTORY) if f.endswith('.parquet')]
dfs = [pd.read_parquet(p, engine='pyarrow') for p in files[:]]
df = pd.concat(dfs, ignore_index=True)
df['date'] = df['date'].map(pd.Timestamp)

print('Собираем уникальные значения')
unique_obj_dict = {}

for col in OBJ_COLS:
    print(f'col_{col}')
    unique_obj_dict[col] = set(df[col].values)
    
for d in unique_obj_dict:
    unique_obj_dict[d] = dict(zip(unique_obj_dict[d], range(len(unique_obj_dict[d]))))

    
with open ('E:/mts/unique_values_dict.pkl', 'wb') as f:
    pickle.dump(unique_obj_dict, f)

with open ('E:/mts/unique_values_dict.pkl', 'rb') as f:
    unique_obj_dict = pickle.load(f)  

print('Меняем типы данных')
df = get_maped_df()
    
print('Причесываем датафрейм')

df = df.reset_index()
df = df.drop('index', axis=1)
df.to_feather('E:/mts/total.feather')

Couldn't find program: 'false'


In [15]:
# Поправил косяки с urls

# df = pd.read_feather('E:/mts/total.feather')
# rev_dict = {value: key for key, value in unique_obj_dict['url_host'].items()}
# df['url_host'] = df['url_host'].map(rev_dict).parallel_map(fix_urls)

# df = df.reset_index()
# df = df.drop('index', axis=1)
# df.to_feather('E:/mts/total.feather')

In [16]:
%%time
df = pd.read_feather('E:/mts/total.feather')



CPU times: total: 2min 57s
Wall time: 2min 46s


In [17]:
median_price = round(df['price'].median())
df['price'] = df['price'].fillna(median_price).parallel_apply(round_by, base=1000)


region_popularity = df.groupby('region_name')['request_cnt'].sum()
df['region_popularity'] = df['region_name'].map(region_popularity)

city_popularity = df.groupby('city_name')['request_cnt'].sum()
df['city_popularity'] = df['city_name'].map(city_popularity)

df['city_region_popularity'] = df['city_popularity'] / df['region_popularity']
# df['city_region_popularity_mul'] = df['city_popularity'] * df['region_popularity']

# df = df.drop(['region_popularity', 'city_popularity'], axis=1)


df['day_of_year'] = df['date'].dt.day_of_year
df['year'] = df['date'].dt.year

df['day'] = df['date'].dt.day
df['month'] = df['date'].dt.month
df['day_of_week'] = df['date'].dt.day_of_week

threshold = 3
df['close_to_march8'] = ((df['month'] == 3) & (abs(df['day'] - 8) <= threshold)).astype(int)
df['close_to_feb23'] = ((df['month'] == 2) & (abs(df['day'] - 23) <= threshold)).astype(int)
df['close_to_valentine'] = ((df['month'] == 2) & (abs(df['day'] - 14) <= threshold)).astype(int)

df['is_weekend'] = (df['day_of_week'] > 5).astype(int)
df['is_friday'] = (df['day_of_week'] == 4).astype(int)

df = df.drop(['day', 'month'], axis=1)

df = reduce_mem_usage(df)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=53816573), Label(value='0 / 538165…

Memory usage of dataframe is 34489.38 MB
Memory usage after optimization is: 20324.10 MB
Decreased by 41.1%


# _______________

In [18]:
df

Unnamed: 0,region_name,city_name,cpe_manufacturer_name,cpe_model_name,url_host,cpe_type_cd,cpe_model_os_type,part_of_day,price,date,...,city_popularity,city_region_popularity,day_of_year,year,day_of_week,close_to_march8,close_to_feb23,close_to_valentine,is_weekend,is_monday
0,27,673,4,347,ad.adriver.ru,3,2,1,21000.0,2022-06-15,...,16437151,0.344971,166,2022,2,0,0,0,0,0
1,27,673,4,347,apple.com,3,2,1,21000.0,2022-06-19,...,16437151,0.344971,170,2022,6,0,0,0,1,0
2,27,673,4,347,avatars.mds.yandex.net,3,2,3,21000.0,2022-06-12,...,16437151,0.344971,163,2022,6,0,0,0,1,0
3,27,673,4,347,googleads.g.doubleclick.net,3,2,3,21000.0,2022-05-16,...,16437151,0.344971,136,2022,0,0,0,0,0,0
4,27,673,4,347,googleads.g.doubleclick.net,3,2,3,21000.0,2022-05-30,...,16437151,0.344971,150,2022,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
322899430,0,363,26,521,avatars.mds.yandex.net,3,0,1,17000.0,2021-07-12,...,576698,0.237427,193,2021,0,0,0,0,0,0
322899431,0,363,26,521,googleads.g.doubleclick.net,3,0,2,17000.0,2021-06-20,...,576698,0.237427,171,2021,6,0,0,0,1,0
322899432,0,363,26,521,online.sberbank.ru,3,0,3,17000.0,2021-08-05,...,576698,0.237427,217,2021,3,0,0,0,0,0
322899433,0,363,26,521,s0.2mdn.net,3,0,2,17000.0,2021-07-19,...,576698,0.237427,200,2021,0,0,0,0,0,0


In [None]:
targets = pd.read_parquet('E:/mts/public_train.pqt', engine='pyarrow')

# _________________

# _________________

In [None]:
targets

# Date 

# Время между первым и последним посещением  сети

In [None]:
agg_time_between_first_last_visit = (df[['user_id', 'date']].\
    groupby('user_id').max() - df[['user_id', 'date']].groupby('user_id').min()).\
    rename(columns={'date': 'days_first_to_last_visit'})
agg_time_between_first_last_visit['days_first_to_last_visit'] = agg_time_between_first_last_visit['days_first_to_last_visit'].\
                                                                        apply(lambda x: x.days)

train = targets.merge(agg_time_between_first_last_visit.reset_index(), how='inner', on='user_id')

In [None]:
plt.figure(figsize=(30, 20))
agg_time_between_first_last_visit['days_first_to_last_visit'].value_counts().plot(kind='bar')
plt.xticks(rotation=90);

In [None]:
def days_first_to_last_visit_bucket(x):
    return bisect.bisect_left([0,20,50,75,90], x, lo=0)

In [None]:
agg_time_between_first_last_visit['days_first_to_last_visit'] = agg_time_between_first_last_visit.days_first_to_last_visit.map(days_first_to_last_visit_bucket)
agg_time_between_first_last_visit = pd.get_dummies(agg_time_between_first_last_visit['days_first_to_last_visit'], prefix='first_last')

In [None]:
train = train.merge(agg_time_between_first_last_visit, how='inner', on = 'user_id')

In [None]:
print(f"Процент пропусков таргета по полу {(len(targets[targets['is_male'] == 'NA']) / len(targets)) * 100}")

# Среднее время пребывания в сети

In [None]:
def mean_time_using_internet(data):
    
    import pandas as pd
    
    data = data.sort_values(by='date')
    intervals = data['date'].diff()
    using_internet = intervals[intervals > pd.Timedelta(0)]
    return round(using_internet.mean() / pd.Timedelta(hours=1), 2)



In [None]:
%%script false
mean_using_internet = df.groupby('user_id').parallel_apply(mean_time_using_internet)
mean_using_internet.fillna(0, inplace=True)
pd.DataFrame(mean_using_internet).rename({0: 'mean_using_internet'}, axis=1).to_csv('E:/mts/mean_using_internet.csv')

In [None]:
mean_using_internet = pd.read_csv('E:/mts/mean_using_internet.csv')

In [None]:
median_use_int = round(mean_using_internet.mean_using_internet.median(), 2)

In [None]:
mean_using_internet['mean_using_internet'] = np.log(mean_using_internet.mean_using_internet.apply(lambda x: median_use_int if x == 0 else x))

In [None]:
mean_using_internet

In [None]:
train = train.merge(mean_using_internet, how='inner', on = 'user_id')

# Количество посещений интеренета по времени суток

In [None]:
data_agg_user_time_per_part_of_day = pd.DataFrame(df[['user_id', 'part_of_day', 'year', 'day_of_year']]\
                                                  .groupby(['user_id', 'year', 'part_of_day'])['day_of_year'].max()\
                                                    - df[['user_id', 'part_of_day', 'year', 'day_of_year']]\
                                                  .groupby(['user_id', 'year', 'part_of_day'])['day_of_year'].min())
data_agg_user_time_per_part_of_day = data_agg_user_time_per_part_of_day.reset_index()

In [None]:
pivot_agg_sum_days_per_part_of_days = data_agg_user_time_per_part_of_day\
    .pivot_table(index=['user_id', 'part_of_day'], columns='year', values='day_of_year', aggfunc='mean')

pivot_agg_sum_days_per_part_of_days.fillna(0, inplace=True)

pivot_agg_sum_days_per_part_of_days['sum_usertime_per_part_day'] = pivot_agg_sum_days_per_part_of_days[2021]\
                                                                    + pivot_agg_sum_days_per_part_of_days[2022]

pivot_agg_sum_days_per_part_of_days.drop([2021, 2022], axis=1, inplace=True)
pivot_agg_sum_days_per_part_of_days.reset_index(inplace=True)

In [None]:
values = pivot_agg_sum_days_per_part_of_days['sum_usertime_per_part_day'].values

rows = pivot_agg_sum_days_per_part_of_days['user_id']

cols = pivot_agg_sum_days_per_part_of_days['part_of_day']

mat = scipy.sparse.coo_matrix((values, (rows, cols)), shape=(rows.max() + 1, cols.max() + 1))

als = implicit.approximate_als.FaissAlternatingLeastSquares(nlist=4, factors = 10, iterations = 30, use_gpu = False, \
       calculate_training_loss = False, regularization = 0.1)

als.fit(mat)

u_factors = als.model.user_factors 
d_factors = als.model.item_factors

user_embs_usertime_per_part_day = pd.DataFrame(u_factors)
user_embs_usertime_per_part_day['user_id'] = user_embs_usertime_per_part_day.index
user_embs_usertime_per_part_day.columns = list(map(str, user_embs_usertime_per_part_day.columns))


train = train.merge(user_embs_usertime_per_part_day, how='inner', on = 'user_id')

In [None]:
train

# _____________

### Сумма посещений по сайтам

In [None]:
data_agg_request = df[['user_id', 'url_host', 'request_cnt']].\
    groupby(['user_id', 'url_host']).agg({'request_cnt': 'sum'}).\
    rename(columns={'request_cnt': 'request_cnt_sum'})

In [None]:
data_agg_request

In [None]:
url_set = set(data_agg_request.index.get_level_values(1))
url_dict = {url: idurl for url, idurl in zip(url_set, range(len(url_set)))}

In [None]:
values = np.array(data_agg_request['request_cnt_sum'])

rows = np.array(data_agg_request.index.get_level_values(0))

cols = np.array(data_agg_request.index.get_level_values(1).map(url_dict))

mat = scipy.sparse.coo_matrix((values, (rows, cols)), shape=(rows.max() + 1, cols.max() + 1))

als = implicit.approximate_als.FaissAlternatingLeastSquares(nlist=200, factors = 40, iterations = 40, use_gpu = False, \
       calculate_training_loss = False, regularization = 0.1)

als.fit(mat)

u_factors = als.model.user_factors 
d_factors = als.model.item_factors

user_embs_url_request_cnt_sum = pd.DataFrame(u_factors)
user_embs_url_request_cnt_sum['user_id'] = user_embs_url_request_cnt_sum.index
user_embs_url_request_cnt_sum.columns = list(map(str, user_embs_url_request_cnt_sum.columns))


In [None]:
train = train.merge(user_embs_url_request_cnt_sum, how='inner', on = 'user_id')

# _________

# ТУТ ПОДУМАТЬ 
## НАПРИМЕР ПО КУЧКАМ ДОМЕНЫ СОБРАТЬ И ПОСМОТРЕТЬ ТЕМАТИЧЕСКИЕ

# Top Level Domain

In [None]:
df['tld'] = df['url_host'].apply(lambda x: x.strip().split('.')[-1])

In [None]:
data_agg_tld = df[['user_id', 'tld', 'request_cnt']].groupby(['user_id', 'tld'], as_index=False)\
                                        .agg({'request_cnt': 'sum'})\
                                        .rename(columns={'request_cnt': 'requests_count_per_domain'})



In [None]:
data_agg_tld

In [None]:
tld_set = set(data_agg_tld['tld'])
tld_dict = {tld: idtld for tld, idtld in zip(tld_set, range(len(tld_set)))}

In [None]:
values = np.array(data_agg_tld['requests_count_per_domain'])

rows = np.array(data_agg_tld['user_id'])

cols = np.array(data_agg_tld['tld'].map(tld_dict))

mat = scipy.sparse.coo_matrix((values, (rows, cols)), shape=(rows.max() + 1, cols.max() + 1))

als = implicit.approximate_als.FaissAlternatingLeastSquares(factors = 30, iterations = 40, use_gpu = False, \
       calculate_training_loss = False, regularization = 0.1)

als.fit(mat)

u_factors = als.model.user_factors 
d_factors = als.model.item_factors

user_embs_tld_request_cnt_sum = pd.DataFrame(u_factors)
user_embs_tld_request_cnt_sum['user_id'] = user_embs_tld_request_cnt_sum.index
user_embs_tld_request_cnt_sum.columns = list(map(str, user_embs_tld_request_cnt_sum.columns))


In [None]:
train = train.merge(user_embs_tld_request_cnt_sum, how='inner', on = 'user_id')

In [None]:
train

# ________

### Сумма посещений по устройствам

In [None]:
data_agg_request_modelname = df[['user_id', 'cpe_model_name', 'request_cnt']].\
    groupby(['user_id', 'cpe_model_name']).agg({'request_cnt': 'sum'}).\
    rename(columns={'request_cnt': 'cpe_request_cnt_sum'})

In [None]:
data_agg_request_modelname

In [None]:
values = np.array(data_agg_request_modelname['cpe_request_cnt_sum'])

rows = np.array(data_agg_request_modelname.index.get_level_values(0))

cols = np.array(data_agg_request_modelname.index.get_level_values(1))

mat = scipy.sparse.coo_matrix((values, (rows, cols)), shape=(rows.max() + 1, cols.max() + 1))

als = implicit.approximate_als.FaissAlternatingLeastSquares(factors = 30, iterations = 30, use_gpu = False, \
       calculate_training_loss = False, regularization = 0.1)

als.fit(mat)

u_factors = als.model.user_factors 
d_factors = als.model.item_factors

user_embs_cpe_request_cnt_sum = pd.DataFrame(u_factors)
user_embs_cpe_request_cnt_sum['user_id'] = user_embs_cpe_request_cnt_sum.index
user_embs_cpe_request_cnt_sum.columns = list(map(str, user_embs_cpe_request_cnt_sum.columns))


In [None]:
train = train.merge(user_embs_cpe_request_cnt_sum, how='inner', on = 'user_id')

### Активность в зависимости от времени суток

In [None]:
data_agg_part_of_day = df[['user_id', 'part_of_day', 'request_cnt']].\
    groupby(['user_id', 'part_of_day'], as_index=False).\
    agg(part_of_day_request_cnt = ('request_cnt', 'sum'))

In [None]:
data_agg_part_of_day

In [None]:
# data_agg_part_of_day = data_agg_part_of_day[['user_id', 'part_of_day_count']].groupby('user_id', as_index=False).median()

In [None]:
values = np.array(data_agg_part_of_day['part_of_day_request_cnt'])
rows = np.array(data_agg_part_of_day['user_id'])
cols = np.array(data_agg_part_of_day['part_of_day'])
mat = scipy.sparse.coo_matrix((values, (rows, cols)), shape=(rows.max() + 1, cols.max() + 1))
als = implicit.approximate_als.FaissAlternatingLeastSquares(nlist=4, factors = 20, iterations = 30, use_gpu = False, \
       calculate_training_loss = False, regularization = 0.1)
als.fit(mat)

u_factors = als.model.user_factors 
d_factors = als.model.item_factors

user_embs_part_of_day_request_sum = pd.DataFrame(u_factors)
user_embs_part_of_day_request_sum['user_id'] = user_embs_part_of_day_request_sum.index
user_embs_part_of_day_request_sum.columns = list(map(str, user_embs_part_of_day_request_sum.columns))




In [None]:
train = pd.merge(train, user_embs_part_of_day_request_sum, how='inner', on='user_id')

# Через dumm

In [None]:
data_agg_part_of_day_dumm = pd.concat([df['user_id'], pd.get_dummies(df['part_of_day'])], axis=1).groupby('user_id').sum()
data_agg_part_of_day_dumm.columns = ['morn', 'midday', 'evening', 'night']

In [None]:
train = pd.merge(train, data_agg_part_of_day_dumm, how='inner', on='user_id')

### Региональная статистика

провернуть через ембеддинги

In [None]:
data_agg_region = df[['user_id',
                      'region_name',
                      'region_popularity',
                      'city_popularity', 
                      'city_region_popularity',
                      'price', ]].merge(get_region_stat(), how='inner', on='region_name').\
                    groupby(['user_id'], as_index=False).agg({
                                                            'price': 'mean',
                                                            'olds_in_region': 'mean',
                                                            'mens_in_region': 'mean',
                                                            'city_popularity': 'mean',
                                                            'region_popularity': 'mean', 
                                                            'city_region_popularity': 'mean',
                                                            'mean_age_in_region': 'mean'
                                                            }).rename(columns={'price': 'mean_price'})

# train = train.merge(data_agg_region, how='inner', on='user_id')

In [None]:
train = train.merge(data_agg_region, how='inner', on='user_id')

In [None]:
train

# _________

In [None]:
df

In [None]:
df['close_to_march8']
df['close_to_feb23']
df['close_to_valentine'] 

df['is_weekend']
df['is_monday'] 

In [None]:
holidays_agg_data = df[['user_id', 'close_to_march8', 'close_to_feb23', 'close_to_valentine', 'is_weekend', 'is_monday']].\
    groupby(['user_id'], as_index=False).\
    agg(count_womens_days = ('close_to_march8', 'sum'),
        count_mens_days = ('close_to_feb23', 'sum'),
        count_valentines_days = ('close_to_valentine', 'sum'),
        count_weekends = ('is_weekend', 'sum'),
        count_fridays = ('is_monday', 'sum')
        
       )


holidays_agg_data['count_womens_days_bin'] = holidays_agg_data['count_womens_days'].apply(lambda x: 1 if x != 0 else 0)
holidays_agg_data['count_mens_days_bin'] = holidays_agg_data['count_mens_days'].apply(lambda x: 1 if x != 0 else 0)
holidays_agg_data['count_valentines_days_bin'] = holidays_agg_data['count_valentines_days'].apply(lambda x: 1 if x != 0 else 0)
holidays_agg_data['count_weekends_bin'] = holidays_agg_data['count_weekends'].apply(lambda x: 1 if x == 0 else 1)
holidays_agg_data['count_fridays_bin'] = holidays_agg_data['count_fridays'].apply(lambda x: 1 if x == 0 else 1)

holidays_agg_data['count_weekends_sum'] = holidays_agg_data['count_weekends']
holidays_agg_data['count_womens_days_sum'] = holidays_agg_data['count_womens_days']
holidays_agg_data['count_mens_days_sum'] = holidays_agg_data['count_mens_days']
holidays_agg_data['count_valentines_days_sum'] = holidays_agg_data['count_valentines_days']
holidays_agg_data['count_fridays_sum'] = holidays_agg_data['count_fridays']




# проверить

придумать бинарные фичи по признакам день матери день отца

In [None]:
# train = train.merge(holidays_agg_data, how='inner', on='user_id')

# _________

In [None]:
CAT_SEX_COLS = [
    'is_less_median_days',
    'is_0_days',
    'is_116_days',
    'is_more_75_days',
    
    
]

In [None]:
train.columns = [f'{col}_{i}' if train.columns.tolist().count(col) > 1 else col for i, col in enumerate(train.columns)]


In [None]:
train.to_csv('E:/mts/train.csv', index=False)

# SEX

In [None]:
asasdasdasd

In [None]:
train_sex = train[train['is_male'] != 'NA'].dropna()
train_sex['is_male'] = train_sex['is_male'].astype('int')
X_sex = train_sex.drop(['user_id', 'age', 'is_male'], axis = 1)
y_sex = train_sex['is_male']



In [None]:
%%time
x_train, x_val, y_train, y_val = train_test_split(\
    X_sex, y_sex, test_size = 0.1, random_state = SPLIT_SEED)
clf_sex = CatBoostClassifier(
    iterations=1500,
    task_type='GPU', 
    devices='0:1',
    custom_loss=['AUC', 'Accuracy']
)
clf_sex.fit(
    x_train, 
    y_train,
#     cat_features= CAT_SEX_COLS,
    verbose = False,
    plot=True,
    eval_set=(x_val, y_val),
       )
print(f'GINI по полу {2 * m.roc_auc_score(y_val, clf_sex.predict_proba(x_val)[:,1]) - 1:2.3f}')

In [None]:
print(m.classification_report(y_val, clf_sex.predict(x_val)))

In [None]:
plt.figure(figsize=(20, 10))
plt.bar(train.columns[3:],clf_sex.feature_importances_)
plt.xticks(rotation=90);

In [None]:
asdasdasd

In [None]:
# to_merge = [
#     agg_time_between_first_last_visit, 
#     user_embs_url_request_cnt_sum, 
#     user_embs_cpe_request_cnt_sum,
#     data_agg_part_of_day_dumm, 
#     data_agg_region, 
#     holidays_agg_data
# ]

In [None]:
id_to_submit.merge(agg_time_between_first_last_visit, how='inner', on='user_id').\
                                                merge(user_embs_url_request_cnt_sum, how='inner', on='user_id').\
                                                merge(user_embs_cpe_request_cnt_sum, how='inner', on='user_id').\
                                                merge(data_agg_part_of_day_dumm, how='inner', on='user_id').\
                                                merge(holidays_agg_data, how='inner', on='user_id')

In [None]:
clf_sex.fit(X_sex, y_sex, verbose = False)
id_to_submit['is_male'] = clf_sex.predict_proba((id_to_submit.merge(agg_time_between_first_last_visit, how='inner', on='user_id').\
                                                merge(user_embs_url_request_cnt_sum, how='inner', on='user_id').\
                                                merge(user_embs_cpe_request_cnt_sum, how='inner', on='user_id').\
                                                merge(data_agg_part_of_day_dumm, how='inner', on='user_id').\
                                                merge(holidays_agg_data, how='inner', on='user_id')).drop(['user_id'], axis=1))[:,1]

# Age

In [None]:
train_age

In [None]:
train_age = train[train['age'] != 'NA'].dropna()
train_age['age'] = train_age['age'].map(age_bucket).astype('int')
X_age = train_age.drop(['user_id', 'age', 'is_male'], axis = 1)
y_age = train_age['age']


In [None]:
y_age.hist()

In [None]:
%%time
x_train, x_val, y_train, y_val = train_test_split(X_age, y_age, test_size = 0.33, random_state = SPLIT_SEED)

clf_age = CatBoostClassifier(
    task_type='GPU', 
    devices='0:1',
    custom_loss=['AUC', 'Accuracy']
)

clf_age.fit(
    x_train, 
    y_train,
    verbose = False,
    plot=True,
    eval_set=(x_val, y_val),
       )
# print(f'GINI по полу {2 * m.roc_auc_score(y_test, clf.predict_proba(x_test)[:,1]) - 1:2.3f}')

In [None]:
predict = clf_age.predict(x_val).reshape(1, -1)[0]

In [None]:
plt.hist(predict)

In [None]:
plt.hist(y_val.values)

In [None]:
print(m.classification_report(y_val.values, predict, \
                            target_names = ['19-25', '26-35','36-45', '46-55', '56-65', '66+']))

In [None]:
clf_age.fit(X_age, y_age, verbose = False)
id_to_submit['age'] = clf_age.predict((id_to_submit.merge(agg_time_between_first_last_visit, how='inner', on='user_id').\
                                                merge(user_embs_url_request_cnt_sum, how='inner', on='user_id').\
                                                merge(user_embs_cpe_request_cnt_sum, how='inner', on='user_id').\
                                                merge(data_agg_part_of_day_dumm, how='inner', on='user_id').\
                                                merge(holidays_agg_data, how='inner', on='user_id')).drop(['user_id'], axis=1))




In [None]:
id_to_submit.to_csv('E:/mts/subs/submission_0.csv', index = False)

In [None]:
pd.read_csv('E:/mts/subs/submission_0.csv')

In [None]:


a = pd.read_feather('E:/mts/scrap/embs.feather')

In [None]:
a.set_index('url_host', inplace=True)

In [None]:
a

In [None]:
my_dict = {index: np.array(row) for index, row in a.iterrows()}

In [None]:
my_dict['last-frontier.ru']

In [None]:
for i in a:
    print(i)

In [None]:
a.drop('url_host', axis=1).values

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.preprocessing import QuantileTransformer

# Load data
data = pd.read_csv('data.csv')

# Define target column and feature columns
target_col = 'target'
feature_cols = [col for col in data.columns if col != target_col]

# Preprocess data using GaussRankScaler
scaler = QuantileTransformer(output_distribution='normal')
data[feature_cols] = scaler.fit_transform(data[feature_cols])

# Define PyTorch dataset and dataloader
class TabularDataset(Dataset):
    def __init__(self, data, target_col):
        self.features = data.drop(columns=[target_col]).values
        self.targets = data[target_col].values
        
    def __len__(self):
        return len(self.features)
    
    def __getitem__(self, idx):
        return torch.FloatTensor(self.features[idx]), torch.FloatTensor([self.targets[idx]])
    
batch_size = 64
dataset = TabularDataset(data, target_col)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Define model architecture
class TabularModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.hidden1 = nn.Linear(input_dim, hidden_dim)
        self.relu1 = nn.ReLU()
        self.hidden2 = nn.Linear(hidden_dim, hidden_dim)
        self.relu2 = nn.ReLU()
        self.output = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()
        
    def forward(self, x):
        x = self.hidden1(x)
        x = self.relu1(x)
        x = self.hidden2(x)
        x = self.relu2(x)
        x = self.output(x)
        x = self.sigmoid(x)
        return x

input_dim = len(feature_cols)
hidden_dim = 32
output_dim = 1
model = TabularModel(input_dim, hidden_dim, output_dim)

# Train model
learning_rate = 0.01
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.MSELoss()

num_epochs = 10
for epoch in range(num_epochs):
    running_loss = 0.0
    for i, (inputs, labels) in enumerate(dataloader):
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print('Epoch %d, loss: %.3f' % (epoch + 1, running_loss / len(dataloader)))

# Get embeddings for entire dataset
model.eval()
if torch.cuda.is_available():
    model = model.cpu()
embeddings = []
with torch.no_grad():
    for i, inputs in enumerate(data[feature_cols].values):
        if torch.cuda.is_available():
            inputs = torch.FloatTensor(inputs).cuda()
        else:
            inputs = torch.FloatTensor(inputs)
        outputs = model(inputs.unsqueeze(0))
        embeddings.append(outputs.cpu().numpy()[0][0])
embeddings = np.array(embeddings)

print('Embeddings shape:', embeddings.shape)
