## Crawling users with votes
## Selecting users with href like "/user/{id}/votes/"

In [None]:
import requests
import lxml.html
import tqdm
import json

In [None]:
# Root of the site; every crawled URL is built on top of it.
base_url = 'https://www.kinopoisk.ru'


def make_url_birth_date(month, day):
    """Build the listing URL for users born on the given month and day.

    Example: ``make_url_birth_date(12, 31)`` gives
    ``https://www.kinopoisk.ru/community/birth_day/12-31/``.
    """
    return '%s/community/birth_day/%s-%s/' % (base_url, month, day)

def make_fetch_page_birth_day(birth_day_url, page):
    """Return the URL of page number ``page`` of a birthday listing.

    ``birth_day_url`` may or may not end with a slash. Any trailing slash is
    dropped before the page part is appended, so the result never contains
    ``//page/``.

    Example: ``https://www.kinopoisk.ru/community/birth_day/12-31/`` and
    page 2 give ``https://www.kinopoisk.ru/community/birth_day/12-31/page/2/``.
    """
    return '/'.join((birth_day_url.rstrip('/'), 'page', str(page))) + '/'

def birth_url_generator():
    """Return the list of birthday listing URLs for every month/day pair.

    Months run 1-12 and days run 1-31 for each month, so pairs such as 2-31
    are included as well. Despite the name, a plain list is returned.
    """
    urls = []
    for month in range(1, 13):
        for day in range(1, 32):
            urls.append(make_url_birth_date(month, day))
    return urls

def get_html(response):
    """Parse a response body into an lxml tree, or return None.

    Only responses whose status code is 200 are parsed; any other status
    yields None.
    """
    if response.status_code == 200:
        return lxml.html.fromstring(response.content)
    return None

### Генерируем ссылки, будем fetch-иться по birth_day

In [None]:
# Preview: print the first 41 generated URLs (indices 0 through 40).
for i, u in enumerate(birth_url_generator()[:41]):
    print(u)

### Собираем список user_id таких пользователей, у которых имеется ссылка на раздел votes 
### Возможно, наличие такой ссылки означает наличие оценок фильмов

In [None]:
def _fetch_listing_page(url, timeout):
    """Download ``url`` and return its parsed HTML tree.

    Raises:
        RuntimeError: when ``get_html`` returns None (non-200 response).
    """
    page_html = get_html(requests.get(url, timeout=timeout))
    if page_html is None:
        # The message deliberately omits the URL: it is used as a
        # counter_error key, so it must stay the same across URLs.
        raise RuntimeError("non-200 response")
    return page_html


def _iter_vote_user_ids(page_html):
    """Yield the user id of every profile link that points to a votes section."""
    for a in page_html.xpath("//p[@class='profile_name']//a[contains(@href,'/votes')]"):
        # The href looks like "/user/{id}/votes/", so the id is the third piece.
        yield a.get("href").split('/')[2]


def users_with_votes_generator(timeout=30):
    """Yield ids of listed users whose profile entry links to a votes section.

    Every birthday listing from ``birth_url_generator()`` is visited. Ids
    already present in ``already_added_users`` are not yielded. If such an id
    shows up on the first page of a listing, the remaining pages of that
    listing are skipped.

    Relies on module-level state that must exist before iteration starts:
    ``already_added_users`` (set of ids to skip) and the Counters
    ``counter_max_page_id``, ``counter_userid`` and ``counter_error``, which
    are updated in place.

    Any exception raised while processing a listing is printed, counted in
    ``counter_error`` under its message, and the crawl moves on to the next
    listing.

    Args:
        timeout: seconds passed to ``requests.get`` so that a stalled
            connection is reported as an error instead of hanging the crawl.
    """
    for birth_day_url in birth_url_generator():
        try:
            current_html = _fetch_listing_page(birth_day_url, timeout)
            # Pagination links end in ".../page/{n}/"; the largest n is the
            # number of pages in this listing.
            max_page_id = max((int(fetch_url.split('/')[-2])
                               for fetch_url in current_html.xpath("//a[contains(@href,'/community/birth_day/')]/@href")))
            counter_max_page_id[birth_day_url] = max_page_id

            flag = False
            for userid in _iter_vote_user_ids(current_html):
                if userid in already_added_users:
                    flag = True
                    continue
                counter_userid[birth_day_url] += 1
                yield (userid)

            if flag:
                print ("Passing url %s because one of users is already added" % birth_day_url)
                continue

            for page_id in range(2, max_page_id+1):
                page_html = _fetch_listing_page(
                    make_fetch_page_birth_day(birth_day_url, page_id), timeout)
                for userid in _iter_vote_user_ids(page_html):
                    if userid in already_added_users:
                        flag = True
                        continue
                    counter_userid[birth_day_url] += 1
                    yield (userid)

            # NOTE: at this point every page of the listing has already been
            # fetched; the message only reports that a known id was seen.
            if flag:
                print ("Passing url %s because one of users is already added" % birth_day_url)
        except Exception as e:
            print ("ERROR for %s: %s" % (birth_day_url,str(e)))
            counter_error [str(e)] += 1

In [None]:
# File that accumulates the collected user ids, one per line.
users_with_votes_file = "users_with_votes.txt"

### Процесс сбора идентификаторов пользователей

In [None]:
import collections

# Ids collected by earlier runs. A missing file simply means nothing has been
# collected yet (first run), so start from an empty set instead of crashing.
try:
    with open(users_with_votes_file, 'r') as f:
        # Blank lines are skipped so that '' never ends up as a user id.
        already_added_users = {line.strip() for line in f if line.strip()}
except FileNotFoundError:
    already_added_users = set()

# Statistics filled in by the crawl: page count per listing URL, number of
# yielded ids per listing URL, and occurrences per error message.
counter_max_page_id = collections.Counter()
counter_userid = collections.Counter()
counter_error  = collections.Counter()

In [None]:
def _flush_users_batch(batch):
    """Append the ids in ``batch`` to the results file and empty the batch."""
    if not batch:
        return
    with open(users_with_votes_file, 'a') as f:
        f.writelines([str(u)+'\n' for u in batch])
    print (counter_error)
    # Clear in place so the caller's list object is reused.
    del batch[:]


users_with_votes_batch = list()
try:
    for user_i in tqdm.tqdm(users_with_votes_generator()):
        if user_i in already_added_users:
            continue
        already_added_users.add(user_i)
        users_with_votes_batch.append(user_i)
        # Flush by actual batch size rather than by the position in the
        # stream, so skipped duplicates cannot make a flush point be missed.
        if len(users_with_votes_batch) >= 10000:
            _flush_users_batch(users_with_votes_batch)
finally:
    # Runs on normal completion and on interruption alike, so ids collected
    # since the last flush are not lost.
    _flush_users_batch(users_with_votes_batch)

In [None]:
# Show the highest page number recorded for each birthday listing URL.
counter_max_page_id

In [None]:
# Show how many user ids were yielded for each birthday listing URL.
counter_userid

In [None]:
# Show how many times each error message was recorded.
counter_error

### Соберём кумулятивную информацию (хотя это можно было сделать в предыдущем разделе)

# Maps each birthday listing URL to the count token taken from its page heading.
counter_count_for_bday = collections.Counter()

%%time 

# For every birthday listing (walked in reverse order) record the highest
# page number and the trailing token of the page heading.
for birth_day_url in reversed(birth_url_generator()):
    try:
        current_html = get_html(requests.get(birth_day_url))

        page_hrefs = current_html.xpath("//a[contains(@href,'/community/birth_day/')]/@href")
        max_page_id = max(int(href.split('/')[-2]) for href in page_hrefs)
        counter_max_page_id[birth_day_url] = max_page_id

        headings = current_html.xpath("//h1[@class='level2' and contains(text(),'День рождения')]")
        # Last whitespace-separated token of the heading text. It is stored
        # as the raw string; a later cell strips its first and last
        # characters before converting it to an integer.
        count_bday = headings[0].text.split()[-1]
        counter_count_for_bday[birth_day_url] = count_bday
    except Exception as e:
        print("ERROR for %s: %s" % (birth_day_url, str(e)))
        counter_error[str(e)] += 1

In [12]:
# Show the errors counted so far; an empty Counter means none were recorded.
counter_error

Counter()

In [16]:
# List birthday URLs from the largest recorded page number down.
counter_max_page_id.most_common()

[('https://www.kinopoisk.ru/community/birth_day/10-1/', 22891),
 ('https://www.kinopoisk.ru/community/birth_day/12-31/', 810),
 ('https://www.kinopoisk.ru/community/birth_day/11-30/', 597),
 ('https://www.kinopoisk.ru/community/birth_day/10-10/', 550),
 ('https://www.kinopoisk.ru/community/birth_day/10-2/', 537),
 ('https://www.kinopoisk.ru/community/birth_day/6-10/', 534),
 ('https://www.kinopoisk.ru/community/birth_day/1-10/', 533),
 ('https://www.kinopoisk.ru/community/birth_day/11-8/', 528),
 ('https://www.kinopoisk.ru/community/birth_day/7-10/', 528),
 ('https://www.kinopoisk.ru/community/birth_day/1-18/', 528),
 ('https://www.kinopoisk.ru/community/birth_day/11-9/', 526),
 ('https://www.kinopoisk.ru/community/birth_day/8-10/', 526),
 ('https://www.kinopoisk.ru/community/birth_day/1-19/', 526),
 ('https://www.kinopoisk.ru/community/birth_day/11-1/', 525),
 ('https://www.kinopoisk.ru/community/birth_day/1-11/', 525),
 ('https://www.kinopoisk.ru/community/birth_day/5-15/', 521),
 ('

In [20]:
# Convert the raw heading tokens to integers by dropping their first and last
# characters (presumably the surrounding parentheses). Values that are
# already integers are left alone, so re-running this cell is safe.
for url, c in list(counter_count_for_bday.items()):
    if isinstance(c, str):
        counter_count_for_bday[url] = int(c[1:-1])

In [25]:
# List birthday URLs from the largest user count down.
counter_count_for_bday.most_common()

[('https://www.kinopoisk.ru/community/birth_day/10-1/', 572258),
 ('https://www.kinopoisk.ru/community/birth_day/12-31/', 20249),
 ('https://www.kinopoisk.ru/community/birth_day/11-30/', 14923),
 ('https://www.kinopoisk.ru/community/birth_day/10-10/', 13735),
 ('https://www.kinopoisk.ru/community/birth_day/10-2/', 13411),
 ('https://www.kinopoisk.ru/community/birth_day/6-10/', 13348),
 ('https://www.kinopoisk.ru/community/birth_day/1-10/', 13307),
 ('https://www.kinopoisk.ru/community/birth_day/7-10/', 13185),
 ('https://www.kinopoisk.ru/community/birth_day/11-8/', 13181),
 ('https://www.kinopoisk.ru/community/birth_day/1-18/', 13181),
 ('https://www.kinopoisk.ru/community/birth_day/8-10/', 13141),
 ('https://www.kinopoisk.ru/community/birth_day/11-9/', 13138),
 ('https://www.kinopoisk.ru/community/birth_day/1-19/', 13138),
 ('https://www.kinopoisk.ru/community/birth_day/11-1/', 13117),
 ('https://www.kinopoisk.ru/community/birth_day/1-11/', 13117),
 ('https://www.kinopoisk.ru/communit

## Всего users (с учётом тех, кто не ставил оценки и не просматривал фильмы)

In [24]:
# Total of the per-birthday counts across all listing URLs.
sum(counter_count_for_bday.values())

3981306

## Количество users, которые, возможно, ставили оценки

In [6]:
# Load every collected id as an integer (the set drops duplicates) and
# display how many distinct ids there are.
with open(users_with_votes_file, 'r') as f:
    all_users = set(int(line.strip()) for line in f)
len(all_users)

1066301