In [1]:
import pandas as pd
from tqdm import tqdm
import numpy as np
import sys
from time import sleep
import pickle

sys.path.append('../src/')
from utils.parsing import get_user_location, get_median_friend_age

In [2]:
# Load the user table from the interim data directory.
users = pd.read_json('../data/interim/users.json')

In [None]:
# Cast the predicted age to integers, keep only users whose predicted age
# is below 60, and renumber the remaining rows from zero.
users['predicted_age'] = users['predicted_age'].astype(int)
users = users[users['predicted_age'] < 60].reset_index(drop=True)

In [None]:
# Create a new feature - weighted_age.
# It is a weighted average of the age taken from the user's social page
# (new_age, weight alpha) and the predicted age (predicted_age, weight
# 1 - alpha). The further the two disagree, the smaller alpha gets.
# Thresholds and weights were found experimentally.
#
# Computed column-wise: the previous row loop called users.iloc[i] twice per
# user, which builds a full row Series on every access.
age_gap = (users['new_age'] - users['predicted_age']).abs()
alpha = np.select(
    [age_gap > 10, age_gap > 5],  # checked in order, first match wins
    [0.2, 0.5],
    default=0.8,                  # gap of 5 or less
)
# astype truncates toward zero, matching the int(...) call of the row loop.
users['weighted_age'] = (
    alpha * users['new_age'] + (1 - alpha) * users['predicted_age']
).astype('int64')

In [3]:
# Keep only users with weighted_age below 45 and report how many are left.
under_45 = users['weighted_age'] < 45
users = users[under_45].reset_index(drop=True)
print(f"{users.shape[0]} users remained in the dataset")

68674 users remained in the dataset


In [4]:
# Load the cached user-id -> location mapping collected so far.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
pred_cities_dict = pd.read_pickle("../data/byproduct/pred_cities_dict.pkl")

In [5]:
# Inspect the filtered frame (pandas shortens the display of long frames).
users

Unnamed: 0,id,photo,sex,is_closed,city,country,likes,weighted_age
0,11946,https://sun6-22.userapi.com/s/v1/ig2/KGV4KtNi0...,1,False,"{'id': 2, 'title': 'Санкт-Петербург'}","{'id': 1, 'title': 'Россия'}",8,35
1,62102,https://sun6-21.userapi.com/s/v1/ig2/iu0M-ERxn...,1,False,,,9,35
2,662418,https://sun6-21.userapi.com/s/v1/if2/gxSDA6_Q6...,2,False,"{'id': 1964269, 'title': 'Hong Kong'}","{'id': 97, 'title': 'Китай'}",16,35
3,1277906,https://sun6-23.userapi.com/s/v1/if1/yzq3SNC64...,1,False,"{'id': 1, 'title': 'Москва'}","{'id': 1, 'title': 'Россия'}",10,34
4,1345035,https://sun6-21.userapi.com/s/v1/if1/1xpNC-mZ-...,1,False,"{'id': 1, 'title': 'Москва'}","{'id': 1, 'title': 'Россия'}",17,35
...,...,...,...,...,...,...,...,...
68669,785057091,https://sun6-23.userapi.com/s/v1/ig2/u-MsVlvpB...,1,False,,,6,21
68670,785828149,https://sun6-23.userapi.com/s/v1/ig2/EViT7gFq4...,1,False,,,145,18
68671,786259561,https://sun6-20.userapi.com/s/v1/ig2/2HRZL37Cf...,2,False,,,3,19
68672,787242273,https://sun6-23.userapi.com/s/v1/ig2/RR7oi_6Ka...,1,False,"{'id': 1, 'title': 'Москва'}","{'id': 1, 'title': 'Россия'}",10,24


In [6]:
# Ids of all users whose profile has no city value.
users_without_city = users.loc[users['city'].isna(), 'id'].to_numpy()

In [7]:
# Number of users with no city on their profile.
len(users_without_city)

35557

In [8]:
# Fetch a location for every city-less user that is not in the cache yet.
# Skipping ids already stored in pred_cities_dict replaces the hand-edited
# resume offset (users_without_city[32538:35557]): after an interruption the
# cell can simply be re-run and continues where it stopped.
pending_ids = [uid for uid in users_without_city if uid not in pred_cities_dict]

# tqdm wraps the sized list (not enumerate) so the bar knows the total.
for i, user_id in enumerate(tqdm(pending_ids)):
    location = get_user_location(user_id)
    if location == 'limit':
        # The helper signalled 'limit' (presumably an API request limit -
        # TODO confirm); stop here and keep everything collected so far.
        print(f"{i}: limit")
        break
    pred_cities_dict[user_id] = location
    # Pause for one second after every third stored result.
    if (i + 1) % 3 == 0:
        sleep(1)

3019it [24:34,  2.05it/s]


In [9]:
# Number of user ids that now have an entry in the location cache.
len(pred_cities_dict)

35557

In [10]:
# Persist the user-id -> location cache to disk.
with open('../data/byproduct/pred_cities_dict.pkl', 'wb') as cache_file:
    pickle.dump(pred_cities_dict, cache_file)

In [6]:
# Reload the cached user-id -> location mapping from disk.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
pred_cities_dict = pd.read_pickle('../data/byproduct/pred_cities_dict.pkl')

In [11]:
def _resolve_city(profile_city, user_id):
    """Return the city name for one user, or None when nothing is known.

    A non-empty string is returned unchanged, so re-running this cell on an
    already converted column does not fail. A non-empty dict contributes its
    'title' entry. Anything else (None, NaN, empty value) falls back to
    element 0 of the cached location stored under user_id in
    pred_cities_dict.
    """
    if isinstance(profile_city, str) and profile_city:
        return profile_city
    if isinstance(profile_city, dict) and profile_city:
        return profile_city.get('title', None)
    predicted = pred_cities_dict.get(user_id, None)
    return predicted[0] if predicted else None


# One pass over the two columns instead of users.iloc[i] per row; this also
# avoids shadowing the builtin `id`.
cities = [
    _resolve_city(profile_city, user_id)
    for profile_city, user_id in zip(users['city'], users['id'])
]

users['city'] = cities

In [21]:
def _resolve_country(profile_country, user_id):
    """Return the country name for one user, or None when nothing is known.

    A non-empty string is returned unchanged, so re-running this cell on an
    already converted column does not fail. A non-empty dict contributes its
    'title' entry. Anything else (None, NaN, empty value) falls back to
    element 1 of the cached location stored under user_id in
    pred_cities_dict.
    """
    if isinstance(profile_country, str) and profile_country:
        return profile_country
    if isinstance(profile_country, dict) and profile_country:
        return profile_country.get('title', None)
    predicted = pred_cities_dict.get(user_id, None)
    return predicted[1] if predicted else None


# One pass over the two columns instead of users.iloc[i] per row; this also
# avoids shadowing the builtin `id`.
countries = [
    _resolve_country(profile_country, user_id)
    for profile_country, user_id in zip(users['country'], users['id'])
]

users['country'] = countries

In [27]:
# is_closed carries no information (every user has an open profile), so the
# column is dropped. Only users with fewer than 1000 likes are kept, on the
# assumption that heavier likers like posts out of habit rather than because
# they really like them.
users = (
    users
    .drop(columns='is_closed')
    .loc[lambda frame: frame['likes'] < 1000]
    .reset_index(drop=True)
)
users

Unnamed: 0,id,photo,sex,city,country,likes,weighted_age
0,11946,https://sun6-22.userapi.com/s/v1/ig2/KGV4KtNi0...,1,Санкт-Петербург,Россия,8,35
1,62102,https://sun6-21.userapi.com/s/v1/ig2/iu0M-ERxn...,1,Москва,Россия,9,35
2,662418,https://sun6-21.userapi.com/s/v1/if2/gxSDA6_Q6...,2,Hong Kong,Китай,16,35
3,1277906,https://sun6-23.userapi.com/s/v1/if1/yzq3SNC64...,1,Москва,Россия,10,34
4,1345035,https://sun6-21.userapi.com/s/v1/if1/1xpNC-mZ-...,1,Москва,Россия,17,35
...,...,...,...,...,...,...,...
67486,785057091,https://sun6-23.userapi.com/s/v1/ig2/u-MsVlvpB...,1,Xi'an,Россия,6,21
67487,785828149,https://sun6-23.userapi.com/s/v1/ig2/EViT7gFq4...,1,Москва,Россия,145,18
67488,786259561,https://sun6-20.userapi.com/s/v1/ig2/2HRZL37Cf...,2,Казань,Россия,3,19
67489,787242273,https://sun6-23.userapi.com/s/v1/ig2/RR7oi_6Ka...,1,Москва,Россия,10,24


In [29]:
# Write the processed user table as a gzip-compressed Parquet file.
users.to_parquet('../data/processed/users.parquet.gzip', compression='gzip')