In [29]:
import requests
from requests.exceptions import HTTPError, Timeout
import re
import json
import os
from datetime import date, datetime
import matplotlib.pyplot as plt

In [30]:
def get_age(birthday):
    """Return the number of completed years between ``birthday`` and today.

    ``birthday`` is any object exposing ``year``, ``month`` and ``day``
    attributes (the callers in this file pass a ``datetime``).
    """
    now = date.today()
    # True (counted as 1) while this year's birthday is still ahead of us.
    birthday_pending = (now.month, now.day) < (birthday.month, birthday.day)
    return now.year - birthday.year - birthday_pending

def make_string(status, post_id, poster_id, first_name, last_name,
                 sex, city, birthday, age, education, text):
    """Build one tab-separated record for a post or a comment.

    The record holds twelve fields in this order: the ten metadata
    arguments as given, the word count of ``text``, and ``text`` itself
    wrapped in double quotes. No trailing newline is added; callers append
    their own last column and line terminator.

    Parameters
    ----------
    status .. education
        Written with ``str()``, so ``None`` is rendered as ``None``.
    text : str or None
        Body of the post/comment. ``None`` is treated as an empty string.

    Returns
    -------
    str
        The tab-joined record.
    """
    if text is None:
        text = ''
    # A "word" is a run of word characters, optionally hyphenated
    # (e.g. "big-world" counts once).
    length = len(re.findall(r"\w+-*\w*", text))
    fields = (status, post_id, poster_id, first_name, last_name,
              sex, city, birthday, age, education, length)
    # Joining guarantees a real tab between every pair of fields; the old
    # hand-written format string had "%st%s" between poster_id and
    # first_name, which glued the two columns together.
    return '\t'.join(str(field) for field in fields) + '\t"%s"' % (text,)

def assemble_tsv(line):
    """Append ``line`` to ``gos_data.tsv`` in the current working directory.

    When the file does not exist yet (or is empty) a header row is written
    first. The header is tab-separated so that it lines up with the
    tab-separated records produced elsewhere in this file.

    Parameters
    ----------
    line : str
        A complete record, including its trailing newline.
    """
    columns = ('status', 'post_id', 'owner_id', 'first_name', 'last_name',
               'sex', 'city', 'birthday', 'age', 'education', 'length',
               'text', 'comment_id')
    path = 'gos_data.tsv'
    needs_header = not os.path.exists(path) or os.path.getsize(path) == 0
    with open(path, 'a', encoding='utf-8') as f:
        if needs_header:
            f.write('\t'.join(columns) + '\n')
        f.write(line)

In [31]:
def get_poster_info(poster_id, token):
    """Fetch profile details of a VK user via ``users.get``.

    Parameters
    ----------
    poster_id : int
        Author id. Negative ids are not looked up at all (the callers in
        this file pass the community's own ``owner_id`` in that case).
    token : str
        VK API access token.

    Returns
    -------
    tuple
        ``(first_name, last_name, sex, city, birthday, age, education)``.
        Always a 7-tuple, so callers can unpack it unconditionally; every
        element is ``None`` when the id is negative, the request fails, or
        the payload has no usable ``response`` entry.
        ``sex`` is ``'female'``, ``'male'`` or ``'unspecified'``.
        ``birthday`` is a ``datetime`` when ``bdate`` holds day, month and
        year; otherwise it is the raw ``bdate`` string and ``age`` is
        ``None``.
    """
    unknown = (None,) * 7
    if poster_id < 0:
        return unknown

    parameters = {'access_token': token, 'v': '5.95', 'user_id': poster_id,
                  'fields': 'city,sex,first_name,last_name,bdate,education'}
    request_timeout = 10  # seconds; without a timeout requests can block forever
    try:
        req = requests.get('https://api.vk.com/method/users.get',
                           params=parameters, timeout=request_timeout)
        req.raise_for_status()
    # A tuple is required here: ``HTTPError or Timeout`` evaluates to just
    # ``HTTPError`` and silently ignores timeouts.
    except (HTTPError, Timeout):
        print('Broken User Info')
        return unknown

    try:
        items = json.loads(req.text)['response'][0]
    except (KeyError, IndexError, TypeError, ValueError):
        # Error payload (no 'response'), empty result list, or invalid JSON.
        return unknown

    first_name = items.get('first_name')
    last_name = items.get('last_name')
    sex = {1: 'female', 2: 'male'}.get(items.get('sex'), 'unspecified')
    city = (items.get('city') or {}).get('title')
    education = items.get('university_name')

    birthday = items.get('bdate')
    age = None
    if birthday is not None:
        try:
            parsed = datetime.strptime(birthday, '%d.%m.%Y')
        except ValueError:
            # bdate without a year (or otherwise unparsable): keep the raw
            # string and leave the age unknown.
            pass
        else:
            birthday = parsed
            age = get_age(birthday)

    return first_name, last_name, sex, city, birthday, age, education

In [32]:
def get_comments(post_id, community_id, token):
    """Collect all comments of one wall post via ``wall.getComments``.

    Comments are requested in pages of 100 until an empty page comes back.
    If a request fails, or the payload has no ``response.items``, paging
    stops and whatever was gathered so far is returned (the previous code
    retried the same page forever).

    Parameters
    ----------
    post_id : int
        Id of the post whose comments are fetched.
    community_id : int
        ``owner_id`` of the wall the post belongs to.
    token : str
        VK API access token.

    Returns
    -------
    tuple(list, list)
        ``comments`` - ids of the comments that have an author;
        ``info`` - one finished TSV line per such comment (record built by
        ``make_string`` plus the comment id and a newline).
    """
    comments = []
    info = []
    offset = 0
    page_size = 100
    request_timeout = 10  # seconds
    while True:
        parameters = {'access_token': token, 'v': '5.95',
                      'owner_id': community_id, 'post_id': post_id,
                      'count': page_size, 'offset': offset}
        try:
            req = requests.get('https://api.vk.com/method/wall.getComments',
                               params=parameters, timeout=request_timeout)
            req.raise_for_status()
        # Tuple, not ``HTTPError or Timeout`` (which only catches HTTPError).
        except (HTTPError, Timeout):
            print('Could not collect comments')
            break
        try:
            items = json.loads(req.text)['response']['items']
        except (KeyError, TypeError, ValueError):
            # Error payload instead of a result, or invalid JSON.
            print('Could not collect comments')
            break
        if not items:
            break
        offset += page_size

        for item in items:
            poster_id = item.get('from_id')
            if poster_id is None:
                # No author on this entry - nothing to attribute, skip it.
                continue
            comment_id = item['id']
            # Escape line breaks exactly like post texts so that one
            # comment always occupies one TSV row.
            text = (item.get('text') or '').replace('\n', '\\n')
            # Guard against a lookup that yields None instead of a tuple.
            poster = get_poster_info(poster_id, token) or (None,) * 7
            comments.append(comment_id)
            meta = make_string('comment', post_id, poster_id, *poster, text)
            meta += '\t' + str(comment_id) + '\n'
            info.append(meta)
    return comments, info

In [33]:
def get_posts(token=None, domain='gosbooking'):
    """Crawl a VK community wall and append every post and comment to the TSV.

    Posts are requested from ``wall.get`` in pages of 100. For each post the
    author is looked up with ``get_poster_info``, its comments are gathered
    with ``get_comments``, and the resulting rows are written through
    ``assemble_tsv``. Crawling stops on the first empty page, on a failed
    request, or on a payload without ``response.items`` (the previous code
    retried a failed page forever).

    Parameters
    ----------
    token : str, optional
        VK API access token. When omitted, the ``VK_TOKEN`` environment
        variable is used, then the legacy built-in token.
    domain : str, optional
        Short address of the wall to crawl.
    """
    if token is None:
        # SECURITY / FIXME: a credential must not live in source control.
        # The literal below is kept only so that existing ``get_posts()``
        # calls keep working; revoke it and supply VK_TOKEN instead.
        token = os.environ.get('VK_TOKEN') or (
            'e320ee05e320ee05e320ee0571e34953c8ee'
            '320e320ee05bfed770b96bf1498649f1e9c')

    offset = 0
    page_size = 100
    request_timeout = 10  # seconds
    while True:
        print('Posts collected:', offset)
        parameters = {'access_token': token, 'domain': domain,
                      'v': '5.95', 'offset': offset, 'count': str(page_size)}
        try:
            req = requests.get('https://api.vk.com/method/wall.get',
                               params=parameters, timeout=request_timeout)
            req.raise_for_status()
        # Tuple, not ``HTTPError or Timeout`` (which only catches HTTPError).
        except (HTTPError, Timeout):
            print('Could not collect posts')
            break
        try:
            items = json.loads(req.text)['response']['items']
        except (KeyError, TypeError, ValueError):
            # Error payload instead of a result, or invalid JSON.
            print('Could not collect posts')
            break
        if not items:
            break

        for item in items:
            post_id = item['id']
            community_id = item['owner_id']
            text = item['text'].replace('\n', '\\n')
            # Turn "[id123|Name]" mentions into plain "Name". Raw strings
            # are essential: a non-raw '\1' is the control character
            # chr(1), not a backreference. The character classes keep each
            # match inside a single pair of brackets.
            text = re.sub(r'\[[^\[\]|]+\|([^\[\]]+)\]', r'\1', text)
            # Unsigned posts are attributed to the community itself.
            poster_id = item.get('signer_id', community_id)
            # Guard against a lookup that yields None instead of a tuple.
            poster = get_poster_info(poster_id, token) or (None,) * 7
            comments, info = get_comments(post_id, community_id, token)
            post = make_string('post', post_id, poster_id, *poster, text)
            post += '\t' + str(comments) + '\n'
            assemble_tsv(post)
            for meta in info:
                assemble_tsv(meta)
        offset += page_size
    print('Done')


# Run the crawl; rows are appended to gos_data.tsv in the working directory.
get_posts()

Posts collected: 0
Posts collected: 100
Posts collected: 200
Posts collected: 300
Posts collected: 400
Done
