# Applied Project in Big Data on Industrial Dataset

## DATA COLLECTION TECHNIQUES
## Part IV. Get data from VK API

### 1. Libraries and credentials

In [None]:
import os
import json
import requests

In [None]:
def access_data(file_path):
    """Load the content of a JSON file (used here for the API credentials).

    Args:
        file_path: Path to the JSON file.

    Returns:
        The parsed JSON content (a dict when the top-level value is an object).

    Raises:
        FileNotFoundError: If `file_path` does not exist.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON files are UTF-8; do not rely on the platform default encoding,
    # which differs between operating systems.
    with open(file_path, encoding='utf-8') as file:
        # A distinct local name avoids shadowing the function itself.
        credentials = json.load(file)
    return credentials

# read the credentials from a JSON file (relative path, so it is resolved
# against the current working directory)
creds = access_data(file_path='access_vkapi.json')
# show only the key names, never the secret values
print(creds.keys())

In [None]:
# VK API version; sent with every request as the `v` query parameter
VER = '5.126'
# access token taken from the credentials file; sent as `access_token`
TOKEN = creds['token']

# Let's start from two GSOM groups in VK
# full addresses of the group pages
URL_SPBU = 'https://vk.com/gsom.spbu'
URL_ABTR = 'https://vk.com/gsom_abiturient'
# short names of the same groups (the last part of the address),
# used as group identifiers in the API requests
GROUP_SPBU = 'gsom.spbu'
GROUP_ABTR = 'gsom_abiturient'

### 2. VK API first steps

Complete manual for VK API is [here](https://dev.vk.com/api/getting-started).

#### 2.1. Groups' descriptions

In [None]:
# API method that returns the descriptions of one or several groups
method = 'groups.getById'
# several groups are requested at once as a comma-separated list
groups = ','.join([GROUP_SPBU, GROUP_ABTR])
# additional fields to return; all fields are here: https://vk.com/dev/objects/group
fields = 'city,country,place,description,members_count'

In [None]:
# Build the request URL: the API method followed by its query parameters.
url = ''.join([
    f'https://api.vk.com/method/{method}?',
    f'group_ids={groups}',
    f'&fields={fields}',
    f'&access_token={TOKEN}',
    f'&v={VER}'
])
# Show the structure of the URL without the secret token: cell output is
# saved inside the notebook file and is easily shared together with it.
print(url.replace(f'access_token={TOKEN}', 'access_token=<hidden>'))

In [None]:
r = requests.get(url)

In [None]:
type(r)

In [None]:
r.text

In [None]:
r.json()

In [None]:
data = r.json()
type(data)

In [None]:
data.keys()

In [None]:
type(data['response'])

In [None]:
len(data['response'])

In [None]:
data['response'][0].keys()

In [None]:
data['response'][1].keys()

In [None]:
# Print every field of every returned group, one key/value pair at a time.
outer_rule = '=' * 70  # separates groups
inner_rule = '-' * 70  # separates fields of one group
for group in data['response']:
    print(outer_rule)
    for field_name in group:
        print(inner_rule)
        print('key:', field_name)
        print('value:', group[field_name])

Next we are going to collect data from the `gsom_ma` VK group, so we first need the numeric `id` of that group for the requests that follow:

In [None]:
groups = 'gsom_ma'
method = 'groups.getById'
fields = 'city,country,place,description,members_count'

In [None]:
# groups.getById request for the group(s) named in `groups`
endpoint = f'https://api.vk.com/method/{method}?'
query = f'group_ids={groups}&fields={fields}&access_token={TOKEN}&v={VER}'
url = endpoint + query

In [None]:
r = requests.get(url)
data = r.json()

In [None]:
id_gsom_ma = data['response'][0]['id']
print(id_gsom_ma)

#### 2.2. Get data on walls

Here we can get all records from the wall of the selected group. The method [wall.get](https://dev.vk.com/method/wall.get) is used for that purpose.

In [None]:
owner_id = id_gsom_ma  # the wall owner: numeric id of the `gsom_ma` group
offset = 0 # to start with
count = 5 # how many records to get
method = 'wall.get'

In [None]:
# Query parameters of the wall.get call. The wall owner is a community here,
# and a community id is passed with a leading minus sign.
request_parts = [
    f'owner_id=-{owner_id}',
    f'offset={offset}',
    f'count={count}',
    f'access_token={TOKEN}',
    f'v={VER}',
]
url = f'https://api.vk.com/method/{method}?' + '&'.join(request_parts)
# send the request and decode the JSON body of the answer
r = requests.get(url)
data = r.json()

In [None]:
data.keys()

In [None]:
len(data['response'])

In [None]:
data['response'].keys()

In [None]:
# how many records are on the wall
data['response']['count']

In [None]:
# data on records itself
len(data['response']['items'])

In [None]:
data['response']['items'][0]

In [None]:
# NOTE: date as Unix timestamp

data['response']['items'][0]['date']

In [None]:
import datetime
# Convert the Unix timestamp of the first post into a datetime object.
# fromtimestamp() without a `tz` argument returns naive local time.
date = datetime.datetime.fromtimestamp(data['response']['items'][0]['date'])
date

In [None]:
# Print id, publication time and text of every loaded post.
separator = '=' * 70
for post in data['response']['items']:
    print(separator)
    # the post date arrives as a Unix timestamp
    posted_at = datetime.datetime.fromtimestamp(post['date'])
    print('id:', post['id'], '| date:', posted_at, '\n', post['text'], '\n')

Now let's draw a bar plot of the posts' views over time. So the task is:
1. Load from 10 to 20 records (posts) from the group
2. Extract the number of views of each post (the `count` value inside the `views` key) and the timestamp of each post (the `date` key)
3. Draw a bar plot of `views` against time

In [None]:
# Load the latest posts of the group for the views plot.
owner_id = id_gsom_ma
offset = 0 # to start with
# the task statement asks for 10 to 20 posts, so request 20 of them
count = 20 # how many records to get
method = 'wall.get'
url = ''.join([
    f'https://api.vk.com/method/{method}?',
    f'owner_id=-{owner_id}',  # leading minus: the owner is a community
    f'&offset={offset}',
    f'&count={count}',
    f'&access_token={TOKEN}',
    f'&v={VER}'
])
r = requests.get(url)
data = r.json()

In [None]:
# Collect the publication time and the view counter of every loaded post.
dates = []
views = []
for item in data['response']['items']:
    # Some posts may come without a `views` object; skip them instead of
    # failing with KeyError, so that `dates` and `views` stay aligned.
    if 'views' not in item:
        continue
    dates.append(datetime.datetime.fromtimestamp(item['date']))
    views.append(item['views']['count'])

In [None]:
import matplotlib.pyplot as plt

# wide and short canvas with a single set of axes
fig, ax = plt.subplots(figsize=(16, 4))
# one bar per post: x is the publication time, height is the view count
ax.bar(dates, views)
# put a tick at every post date and rotate the labels so they do not overlap
plt.xticks(dates, rotation='vertical')
plt.show()

### 3. VK API headhunt

We can get data not only on the groups but also on the members of those groups:
- method [`groups.getMembers`](https://dev.vk.com/method/groups.getMembers) to get the list of members
- method [`users.get`](https://dev.vk.com/method/users.get) to get data on the specified users

#### 3.1. Get all members of the group

In [None]:
group_name = 'gsom_ma'  # short name of the group, sent as `group_id`
offset = 0 # to start from
count = 100 # number of members to get their ids
method = 'groups.getMembers'

In [None]:
# Build the groups.getMembers request for one page of member ids.
url = ''.join([
    f'https://api.vk.com/method/{method}?',
    f'group_id={group_name}',
    f'&offset={offset}',
    f'&count={count}',
    f'&access_token={TOKEN}',
    f'&v={VER}'
])
# Show the structure of the URL without the secret token: cell output is
# saved inside the notebook file and is easily shared together with it.
print(url.replace(f'access_token={TOKEN}', 'access_token=<hidden>'))

In [None]:
# send the request and decode the JSON body of the answer
r = requests.get(url)
data = r.json()
# `count` is the size of the whole group, `items` holds the ids of this page
members_page = data['response']
total_members = members_page['count']
loaded_members = len(members_page['items'])
print(
    'group:', group_name,
    '| total members:', total_members,
    '| loaded:', loaded_members
)

In [None]:
data['response']

In [None]:
list_of_members = data['response']['items']
print(len(list_of_members))

In [None]:
list_of_members[:10]

#### 3.2. Get data on one member

In [None]:
# NOTE(review): `group_name` is not used by the `users.get` request built below
group_name = 'gsom.spbu'
method = 'users.get'
user = '1212'  # id of the user to look up, sent as `user_ids`

In [None]:
# all fields https://vk.com/dev/objects/user
fields = 'sex,bdate,city,country,home_town,education,universities,schools,status,last_seen,occupation'

In [None]:
# Build the users.get request for the selected user and profile fields.
url = ''.join([
    f'https://api.vk.com/method/{method}?',
    f'user_ids={user}',
    f'&fields={fields}',
    f'&access_token={TOKEN}',
    f'&v={VER}'
])
# Show the structure of the URL without the secret token: cell output is
# saved inside the notebook file and is easily shared together with it.
print(url.replace(f'access_token={TOKEN}', 'access_token=<hidden>'))

In [None]:
r = requests.get(url)
data = r.json()

In [None]:
data

#### 3.3. Get data on the wall

The same method as for the groups!

In [None]:
owner_id = '1212'
offset = 0
count = 20
method = 'wall.get'

In [None]:
# `owner_id` holds a user id here. In wall.get a leading minus marks a
# community id, so the wall of a user is requested with the plain id;
# with the minus the request would go to the community with that number.
url = ''.join([
    f'https://api.vk.com/method/{method}?',
    f'owner_id={owner_id}',
    f'&offset={offset}',
    f'&count={count}',
    f'&access_token={TOKEN}',
    f'&v={VER}'
])
# send the request and decode the JSON body of the answer
r = requests.get(url)
data = r.json()

In [None]:
data

In [None]:
folder = 'walls_data'
os.makedirs(folder, exist_ok=True)

In [None]:
import time
from tqdm.auto import tqdm
from random import uniform

offset = 0
count = 20
method = 'wall.get'

# Download up to `count` wall posts of every group member and store each
# successful answer in its own JSON file inside `folder`.
for user_id in tqdm(list_of_members):
    url = ''.join([
        f'https://api.vk.com/method/{method}?',
        # plain id without a minus: the members are users, and a leading
        # minus would address the community with the same number instead
        f'owner_id={user_id}',
        f'&offset={offset}',
        f'&count={count}',
        f'&access_token={TOKEN}',
        f'&v={VER}'
    ])
    # the timeout keeps one stalled connection from blocking the whole loop
    r = requests.get(url, timeout=30)
    data = r.json()
    if 'response' in data:
        file_path = f'{folder}/{user_id}.json'
        with open(file_path, 'w') as file:
            json.dump(data, file)
    else:
        # show which member failed and what came back instead of `response`
        print(user_id, data.get('error', data))
    # random pause between requests (presumably to stay below the API
    # request-rate limit)
    time.sleep(uniform(.1, 1.1))

#### 3.4. Process data on the wall

In [None]:
file_list = os.listdir(folder)

In [None]:
import pandas as pd

# Read every saved answer into its own data frame and stack them together.
dfs = [] # an empty list to store the data frames
for file_name in file_list:
    # check the extension instead of looking for "json" anywhere in the name
    if file_name.endswith('.json'):
        data = pd.read_json(
            f'{folder}/{file_name}',
            lines=True
        ) # read data frame from json file
        # Append inside the `if`: otherwise an entry that is not a JSON file
        # would re-append the previous frame (or whatever `data` held before
        # the loop started).
        dfs.append(data) # append the data frame to the list
df = pd.concat(dfs, ignore_index=True) # concatenate all the data frames in the list.
df.head()

In [None]:
# Gather the posts from all saved answers into one flat list of dicts
# and build a single data frame from it.
all_data = []
json_files = [name for name in file_list if 'json' in name]
for file_name in json_files:
    with open(f'{folder}/{file_name}') as file:
        data = json.load(file)
    # the posts themselves are stored under response -> items
    all_data += data['response']['items']
df = pd.DataFrame(all_data)
df.head()

In [None]:
# show all columns of a data frame instead of truncating wide tables
pd.set_option('display.max_columns', None)

# Flatten the nested post dicts of every saved answer into columns
# (one frame per file), then stack the frames together.
all_data = []
for file_name in file_list:
    if 'json' not in file_name:
        continue
    with open(f'{folder}/{file_name}') as file:
        data = json.load(file)
    wall_items = data['response']['items']
    all_data.append(pd.json_normalize(wall_items))
df = pd.concat(all_data, ignore_index=True)
df.head()

In [None]:
df['owner_id'] = df['owner_id'].astype(str)

In [None]:
df.groupby(by='owner_id')['likes.count'].sum()

In [None]:
# Bar plot of the total number of likes collected by each wall owner.
plt.figure(figsize=(16, 4))
likes_per_owner = df.groupby(by='owner_id')['likes.count'].sum()
likes_per_owner.plot.bar()
plt.xticks(rotation='vertical')
plt.show()

We have already seen how to get data on one member of the group with the method `users.get`, but this method can take many members' ids at once:

In [None]:
# let's take ALL the members
list_of_members

In [None]:
# Request the profile data of all loaded members in a single users.get call.
# The complete list of available fields: https://vk.com/dev/objects/user

group_name = 'gsom.spbu'
method = 'users.get'
# the ids are sent as one comma-separated string
users = ','.join(map(str, list_of_members))
fields = 'sex,bdate,city,country,home_town,education,universities,schools,status,last_seen,occupation'
url = (
    f'https://api.vk.com/method/{method}?'
    f'user_ids={users}'
    f'&fields={fields}'
    f'&access_token={TOKEN}'
    f'&v={VER}'
)
r = requests.get(url)
data = r.json()

In [None]:
# Print the basic profile data of every member; the optional fields are
# printed only when all three of them are present.
for item in data['response']:
    print('-' * 70)
    print(
        'id:', item['id'],
        '\nfirst name:', item['first_name'], 
        '\nlast name:', item['last_name']
    )
    try:
        print(
            '\nbirth date:', item['bdate'], 
            '\nhome town:', item['home_town'], 
            '\nstatus:', item['status']
        )
    # Only a missing optional field is expected here. A bare `except` would
    # also hide unrelated errors and swallow KeyboardInterrupt.
    except KeyError:
        print('no more data')

In [None]:
# Names of all universities listed by the members: one entry per
# (member, university) pair, so a name repeats for every member who lists it.
univers = [
    university['name']
    for member in data['response']
    if 'universities' in member
    for university in member['universities']
]
univers

In [None]:
len(set(univers))

In [None]:
from collections import Counter

In [None]:
freqs = dict(Counter(univers))

In [None]:
# Reorder the frequencies from the most to the least common university.
# The sort is stable, so equal counts keep their current relative order.
names_by_count = sorted(freqs, key=freqs.get, reverse=True)
freqs = {name: freqs[name] for name in names_by_count}
freqs

In [None]:
# Bar plot of how many times each university is mentioned.
plt.figure(figsize=(16, 6))
# keys() and values() iterate in the same order; unlike unpacking
# `zip(*freqs.items())` this also works when `freqs` is empty
plt.bar(list(freqs.keys()), list(freqs.values()))
plt.xticks(rotation='vertical')
plt.show()