In [1]:
import os
# Work from the project directory when it exists on this machine. The location
# can be overridden with the VK_GROUP_PROJECT_DIR environment variable; when the
# directory is missing the notebook simply keeps its current working directory
# instead of crashing on a path that only exists on the author's computer.
PROJECT_DIR = os.environ.get(
    'VK_GROUP_PROJECT_DIR',
    '/Users/hsi/projects/data_course/vk_group',
)
if os.path.isdir(PROJECT_DIR):
    os.chdir(PROJECT_DIR)

# exist_ok=True makes the cell safe to re-run and removes the
# check-then-create race of `exists()` followed by `mkdir()`.
os.makedirs('out', exist_ok=True)


In [2]:
import requests
import json


class VkApiError(Exception):
    """Raised when the VK API answers with an ``error`` payload."""


class VkApi:
    """Small client for the VK HTTP API.

    Every call is a POST to ``url + <method name>`` carrying the access token
    and the API version alongside the method-specific parameters.
    """

    url = 'https://api.vk.com/method/'
    # Seconds to wait for the server; `requests` waits indefinitely by default.
    timeout = 30

    def __init__(self, token, version):
        """Store the access token and the API version sent with each call."""
        self.token = token
        self.version = version

    def get_group_members(self, group_id, offset=None):
        """Return the ``items`` list of one ``groups.getMembers`` page.

        Args:
            group_id: Identifier of the group to query.
            offset: Position of the first member to return; ``None`` leaves
                the parameter out of the request.
        """
        data = {
            'group_id': group_id,
            'fields': 'sex, bdate',
            'offset': offset
        }
        return self.__get_method('groups.getMembers', data)['items']

    def get_group_size(self, group_id):
        """Return the ``count`` reported by ``groups.getMembers`` for the group."""
        data = {
            'group_id': group_id,
            # Only the total is needed, so ask for a single member.
            'count': 1
        }
        return self.__get_method('groups.getMembers', data)['count']

    def get_user_interests(self, user_ids):
        """Return the ``users.get`` response for ``user_ids`` with ``interests``.

        Args:
            user_ids: User identifiers in the form the API expects (the caller
                in this file passes a comma-separated string).
        """
        data = {
            'user_ids': user_ids,
            'fields': 'interests'
        }
        return self.__get_method('users.get', data)

    def __get_method(self, method, data):
        """Call ``method`` and return the ``response`` part of the JSON reply.

        Args:
            method: API method name appended to ``url``.
            data: Method parameters; entries whose value is ``None`` are
                dropped before sending.

        Raises:
            VkApiError: The reply contains an ``error`` object; the exception
                message is its ``error_msg``.
            requests.HTTPError: The server answered with an HTTP error status.
        """
        # Drop unset optional parameters explicitly rather than relying on
        # how the HTTP library encodes None.
        payload = {key: value for key, value in data.items() if value is not None}
        payload['access_token'] = self.token
        payload['v'] = self.version

        response = requests.post(
            self.url + method,
            data=payload,
            timeout=self.timeout,
        )
        # Fail on transport-level errors before trying to decode the body.
        response.raise_for_status()
        body = response.json()

        error = body.get('error')
        if error:
            raise VkApiError(error['error_msg'])
        return body['response']



In [3]:
import pandas as pd


def pandas_df(foo):
    """Decorate ``foo`` so its result is returned as a DataFrame indexed by ``id``.

    ``foo`` is expected to return a collection of records that each carry an
    ``id`` key. An empty result yields an empty frame whose index is still
    named ``id`` instead of raising ``KeyError`` for the missing column.
    """
    from functools import wraps

    @wraps(foo)  # keep the wrapped function's name and docstring
    def wrapped(*args, **kwargs):
        data = foo(*args, **kwargs)
        if len(data) == 0:
            # No records means no 'id' column to promote to the index.
            return pd.DataFrame(index=pd.Index([], name='id'))
        return pd.DataFrame.from_dict(data).set_index('id')
    return wrapped


class VkApiPandas(VkApi):
    """``VkApi`` variant whose record-returning calls produce DataFrames."""

    @pandas_df
    def get_group_members(self, group_id, offset=None):
        """Return one page of group members as a DataFrame indexed by ``id``."""
        return super().get_group_members(group_id, offset)

    @pandas_df
    def get_user_interests(self, user_ids):
        """Return the requested users' data as a DataFrame indexed by ``id``."""
        return super().get_user_interests(user_ids)

    def get_members_with_interests(self, group_id, offset=None):
        """Return one page of members with an added ``interests`` column.

        The ids of the fetched members are joined into a comma-separated
        string and passed to ``get_user_interests``; the resulting
        ``interests`` column is joined back onto the members by index.
        """
        members = self.get_group_members(group_id, offset)
        user_ids = members.index.astype('str').str.cat(sep=',')
        interests = self.get_user_interests(user_ids)
        return members.join(interests['interests'])

    def get_members_with_interests_all(self, group_id):
        """Return every member of the group with their interests.

        The size of the first page is used as the step for the remaining
        offsets. Empty strings in the combined frame are replaced by missing
        values.

        Raises:
            ValueError: The same member id appears on more than one page
                (``verify_integrity=True``).
        """
        first_page = self.get_members_with_interests(group_id, 0)
        members_per_request = first_page.shape[0]
        if members_per_request == 0:
            # Nothing to page through, and range() rejects a zero step.
            return first_page
        group_size = self.get_group_size(group_id)

        df_list = [first_page]
        df_list.extend(
            self.get_members_with_interests(group_id, offset)
            for offset in range(members_per_request, group_size, members_per_request)
        )
        members = pd.concat(df_list, verify_integrity=True)
        # The dict form always means "replace '' with None". The positional
        # form replace('', None) is treated by some pandas versions as
        # "value not given" and forward-fills from the previous row instead.
        return members.replace({'': None})


In [5]:
# Prefer a token supplied through the VK_ACCESS_TOKEN environment variable so
# the secret never has to be typed into (and saved with) the notebook; the
# placeholder below is used only when the variable is not set.
access_token = os.environ.get('VK_ACCESS_TOKEN', 'enter your vk access token')
vk_api = VkApiPandas(access_token, '5.67')
group_id = 'm_gandhi'

# Download every member once and cache the table on disk for the cells below.
members = vk_api.get_members_with_interests_all(group_id)
members.to_csv('out/group_members.csv', header=True)
members.shape


(7217, 6)

In [6]:
# Reload the cached member table (indexed by user id) and drop every account
# whose 'deactivated' column marks it as deleted or banned.
members = (
    pd.read_csv('out/group_members.csv')
    .set_index('id')
    .loc[lambda frame: ~frame['deactivated'].isin(['deleted', 'banned'])]
)
print(members.shape)
members.head()


(5231, 6)


Unnamed: 0_level_0,bdate,deactivated,first_name,last_name,sex,interests
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
15878,2.2.1989,,Yulchik,Kleymyonova,1,
18120,2.6,,Yulia,Podtikhova,1,
22018,15.3,,Olga,Rubia,1,
25519,25.4,,Anastasia,Rodina,1,
35290,14.10.1986,,Elena,Grimani,1,"James Ensor, Emil Cioran, Le Chat, Andy Warhol..."


In [7]:
# value_counts() only lists codes that actually occur, so each code is looked
# up with a default of 0 instead of raising KeyError when a group has, for
# example, no members with an unspecified sex.
gender = members['sex'].value_counts()
gender = {
    'female': gender.get(1, 0),
    'male': gender.get(2, 0),
    '?': gender.get(0, 0),
}
gender


{'female': 2738, 'male': 2490, '?': 3}

In [9]:
from datetime import datetime


def calculate_age(born, from_day=None):
    """Return the age in whole years of someone born on ``born``.

    Args:
        born: Date of birth; any object with ``year``, ``month`` and ``day``.
        from_day: Date on which the age is measured (same attributes as
            ``born``). Defaults to 1 August 2017.

    Returns:
        Completed years between ``born`` and ``from_day``.
    """
    if from_day is None:
        from_day = datetime(2017, 8, 1).date()
    # One year less when this year's birthday is still ahead of from_day.
    birthday_pending = (from_day.month, from_day.day) < (born.month, born.day)
    return from_day.year - born.year - birthday_pending


# Only birthdays that include a year can be turned into an age: keep the
# strings longer than 6 characters (day.month.year) and parse those.
birth_dates = members['bdate'].dropna()
birth_dates = birth_dates[birth_dates.astype('str').map(len) > 6]
birth_dates = pd.to_datetime(birth_dates, format='%d.%m.%Y')
# assign() builds a new frame rather than writing a column into the filtered
# frame in place; members without a full birthday get a missing age.
members = members.assign(age=birth_dates.apply(calculate_age))

oldest = members['age'].max()  # informational; no longer used as a bin edge
age_labels = ['<=10', '11-20', '21-30', '>=31']
# An open-ended last bin keeps the edges increasing even when nobody is older
# than 30 (using `oldest` as the edge would make pd.cut fail in that case);
# include_lowest puts an age of exactly 0 into the '<=10' bin.
age_counts = pd.cut(
    members['age'],
    [0, 10, 20, 30, float('inf')],
    labels=age_labels,
    include_lowest=True,
).value_counts()
age = {label: age_counts[label] for label in age_labels}
age['?'] = members['age'].isna().sum()
age


{'<=10': 0, '11-20': 240, '21-30': 980, '>=31': 530, '?': 3481}

In [10]:
import re


def format_interest(raw_interest):
    """Split a free-text interests string into normalised, distinct entries.

    The text is lower-cased and cut at every run of characters that are not
    Cyrillic/Latin letters, whitespace or quotes, and at newlines. Each piece
    has its whitespace collapsed and trimmed, duplicates are dropped and
    entries of three characters or fewer are discarded.

    Args:
        raw_interest: The interests text; the string ``'nan'`` (a missing
            value after ``astype('str')``) or a non-string means "no data".

    Returns:
        A Series of cleaned interest strings, or ``None`` when there is no
        data.
    """
    if not isinstance(raw_interest, str) or raw_interest == 'nan':
        return None
    pieces = re.split(r'[^а-яa-zё\s\'\"]+|\n+', raw_interest.lower())
    # str.split() + join collapses inner whitespace and trims both ends, so an
    # entry at the very start or end of the text matches the same entry
    # written elsewhere.
    cleaned = [' '.join(piece.split()) for piece in pieces]
    interest = pd.Series(cleaned, dtype='object').drop_duplicates()
    return interest[interest.str.len() > 3]


# Build the per-member results with a plain list rather than Series.apply:
# pandas documents that apply may return a DataFrame when the function
# returns a Series, which would break the concatenation below.
per_member = [
    format_interest(raw) for raw in members['interests'].astype('str')
]
per_member = [entries for entries in per_member if entries is not None]
# pd.concat raises on an empty list, so fall back to an empty Series when no
# member listed any interests.
interests = pd.concat(per_member) if per_member else pd.Series(dtype='object')
interests.value_counts().head(10)


музыка          78
спорт           63
психология      57
путешествия     54
философия       43
саморазвитие    37
книги           32
природа         31
история         31
литература      25
dtype: int64

In [11]:
# describe() on the Series of interest strings exposes its most frequent
# value under the 'top' label.
top_interest = interests.describe().loc['top']
top_interest


'музыка'

In [12]:
def format_dict(dict_):
    """Return a new dict with the same keys and every value passed through ``int``."""
    return {key: int(value) for key, value in dict_.items()}


# Collect the three statistics into one structure; the counts go through
# format_dict so every value is a plain int.
result = dict(
    gender=format_dict(gender),
    age=format_dict(age),
    top_interest=top_interest,
)
result


{'gender': {'female': 2738, 'male': 2490, '?': 3},
 'age': {'<=10': 0, '11-20': 240, '21-30': 980, '>=31': 530, '?': 3481},
 'top_interest': 'музыка'}

In [13]:
# ensure_ascii=False writes the Cyrillic text as-is, so the file encoding is
# pinned to UTF-8 instead of depending on the platform's default encoding.
with open('out/group.json', 'w', encoding='utf-8') as fp:
    json.dump(result, fp, ensure_ascii=False, indent=4)
