In [1]:
import json
import os
import sys
import time

In [2]:
from common.main_mapping import *
from common.utils import *

In [3]:
# Make the notebook's working directory importable (added to sys.path only once).
module_path = os.path.abspath('.')
print(module_path)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

/Users/xmanatee/Desktop/alyona/analytics


# Processing and squeezing data

In [4]:
# import json
# with open("data/poll_307034744.json", "r") as f:
#     poll = json.load(f)
# poll

### Building main mappings:

In [5]:
# Directory the notebook reads the raw dumps from (group_wall.json, poll_<id>.json)
# and writes its intermediate JSON mappings to.
DATA_DIR = "../data/"
# Directory that receives the generated stat_*.json files.
DATAX_DIR = "../datax/"

In [6]:
# build_main_mapping is not defined in this notebook (presumably star-imported from
# `common`); judging by the target names it returns a poll -> answer -> user-ids
# mapping and a user-id -> user lookup — TODO confirm against the helper.
poll_id_to_answer_id_to_user_ids, user_id_to_user = build_main_mapping(DATA_DIR)

In [7]:
# Persist the user lookup as JSON inside DATA_DIR.
user_lookup_path = DATA_DIR + "user_id_to_user.json"
with open(user_lookup_path, "w") as out_file:
    json.dump(user_id_to_user, out_file)

In [8]:
# Persist the poll -> answer -> user ids mapping as JSON inside DATA_DIR.
poll_mapping_path = DATA_DIR + "poll_id_to_answer_id_to_user_ids.json"
with open(poll_mapping_path, "w") as out_file:
    json.dump(poll_id_to_answer_id_to_user_ids, out_file)

Now we have the variable 'polls', which is indexed as [POLL_ID \* ANSWER_ID \* USER]; the ids are not the real ones, just indices.

# Now let's make some stats

In [9]:
# Maximum number of entries kept in each chart / preview list below.
CHART_ITEM_NUMBER = 5

# YESNO

### Let's find all YesOrNo questions

In [10]:
import json

# Load the dumped group wall from DATA_DIR; the posts are read from its "items" list later.
group_wall_path = DATA_DIR + "group_wall.json"
with open(group_wall_path, "r") as in_file:
    group_wall_json = json.load(in_file)

In [11]:
# group_wall_json["items"][0]

In [12]:
def clear_text(text):
    """Return `text` lower-cased with every non-alphabetic character removed."""
    lowered = text.lower()
    return ''.join(char for char in lowered if char.isalpha())

In [13]:
# Collect every non-anonymous poll that offers both a "да" and a "нет" answer,
# remembering the ids of those two answers.
# NOTE(review): get_poll is not defined in this notebook (presumably star-imported
# from `common`); it is expected to return None for posts without a poll — confirm.
yes_no_polls = {}

for wall_post in group_wall_json["items"]:
    poll = get_poll(wall_post)
    if poll is None or poll["anonymous"]:
        continue

    answers = poll["answers"]
    answer_texts = [clear_text(answer["text"]) for answer in answers]
    if "да" not in answer_texts or "нет" not in answer_texts:
        continue

    # list.index picks the first matching answer if a text occurs more than once.
    yes_no_polls[poll["id"]] = {
        "id": poll["id"],
        "yes_id": answers[answer_texts.index("да")]["id"],
        "no_id": answers[answer_texts.index("нет")]["id"],
    }
        

In [14]:
# Report how many polls were recognised as yes/no questions.
print("Number of yes/no questions: {}".format(len(yes_no_polls)))

Number of yes/no questions: 145


In [15]:
# Make sure the output directory for the stat files exists (no error if it already does).
os.makedirs(DATAX_DIR, exist_ok=True)

### Now let's find out who the YESNO people are

In [16]:
from collections import defaultdict

# Per-user tallies: how many yes/no polls each user answered with the "yes"
# option (yes_cnt) and with the "no" option (no_cnt).
yes_cnt = defaultdict(int)
no_cnt = defaultdict(int)

for yes_no_poll in yes_no_polls.values():
    poll_filepath = DATA_DIR + "poll_{}.json".format(yes_no_poll["id"])

    # A poll without a dumped voters file is reported and skipped.
    try:
        with open(poll_filepath, "r") as f:
            poll = json.load(f)
    except FileNotFoundError as error:
        print(error)
        continue

    # The loop below expects a list of per-answer records; any other payload
    # is reported and skipped instead of being iterated (which would raise
    # or miscount).
    if not isinstance(poll, list):
        print("error")
        continue

    for answer in poll:
        if answer["answer_id"] == yes_no_poll["yes_id"]:
            for user in answer["users"]["items"]:
                yes_cnt[user["id"]] += 1
        if answer["answer_id"] == yes_no_poll["no_id"]:
            for user in answer["users"]["items"]:
                no_cnt[user["id"]] += 1


In [17]:
def get_max_cnt(cnt_dict):
    """Return every key of `cnt_dict` whose count equals the maximum count.

    Keys are returned in the mapping's iteration order; an empty mapping
    yields an empty list.
    """
    if not cnt_dict:
        return []
    # Compute the maximum once instead of once per key.
    top_count = max(cnt_dict.values())
    return [key for key, count in cnt_dict.items() if count == top_count]

In [18]:
def get_max_cnts(cnt_dict, limit=None):
    """Return the keys with the highest counts, best first.

    Args:
        cnt_dict: mapping of key -> count.
        limit: maximum number of keys to return; when omitted, the
            notebook-wide CHART_ITEM_NUMBER is used.

    Returns:
        A list of at most `limit` keys ordered by descending count. Keys with
        equal counts keep their order from `cnt_dict` (the sort is stable).
    """
    if limit is None:
        limit = CHART_ITEM_NUMBER
    ranked = sorted(cnt_dict.items(), key=lambda item: item[1], reverse=True)
    return [key for key, _count in ranked[:limit]]

In [19]:
def id_to_name(user_id):
    """Return "<first_name> <last_name>" for `user_id`, looked up in user_id_to_user."""
    record = user_id_to_user[user_id]
    first_name = record["first_name"]
    last_name = record["last_name"]
    return first_name + " " + last_name

def ids_to_names(user_ids):
    """Return the display names for `user_ids`, in the same order."""
    return [id_to_name(user_id) for user_id in user_ids]

In [20]:
# Users with the most "yes" answers, already converted to display names.
top_yes_names = ids_to_names(get_max_cnts(yes_cnt))

# Note: the "user_ids" field carries names, not numeric ids.
stat = {
    "stat_id": "yes_fellas",
    "stat_icon": "icon-like",
    "stat_name": "ДАшки",
    "stat_description": "Тот самый разборчивый человек, что вечно отвечает ДА.",
    "user_ids": top_yes_names
}

yes_stat_path = DATAX_DIR + "stat_yes_fellas.json"
with open(yes_stat_path, "w") as out_file:
    json.dump(stat, out_file)

In [21]:
# Users with the most "no" answers, already converted to display names.
top_no_names = ids_to_names(get_max_cnts(no_cnt))

# Note: the "user_ids" field carries names, not numeric ids.
stat = {
    "stat_id": "no_fellas",
    "stat_icon": "icon-dislike",
    "stat_name": "НЕТушки",
    "stat_description": "Тот самый другой разборчивый человек.",
    "user_ids": top_no_names
}

no_stat_path = DATAX_DIR + "stat_no_fellas.json"
with open(no_stat_path, "w") as out_file:
    json.dump(stat, out_file)

# Make 1x1 correlation

In [22]:
import json

# Read the poll -> answer -> user ids mapping back from DATA_DIR.
# JSON object keys are always strings, so the poll and answer keys of the
# reloaded mapping are str regardless of their type before the dump.
poll_mapping_path = DATA_DIR + "poll_id_to_answer_id_to_user_ids.json"
with open(poll_mapping_path, "r") as in_file:
    poll_id_to_answer_id_to_user_ids = json.load(in_file)

In [23]:
# For every ordered pair of distinct users, count the polls in which both voted.
user_id_to_user_id_to_ncommon_polls = {}
for answer_id_to_user_ids in poll_id_to_answer_id_to_user_ids.values():
    # Everybody who picked any answer of this poll.
    participants = set()
    for voters in answer_id_to_user_ids.values():
        participants.update(voters)

    for user_id_1 in participants:
        # Every participant gets a row, even when it stays empty.
        shared_polls = user_id_to_user_id_to_ncommon_polls.setdefault(user_id_1, {})
        for user_id_2 in participants:
            if user_id_2 != user_id_1:
                shared_polls[user_id_2] = shared_polls.get(user_id_2, 0) + 1

In [24]:
# For every ordered pair of distinct users, count how many times both picked
# the same answer of the same poll.
user_id_to_user_id_to_ncommon_answer = {}
for answer_id_to_user_ids in poll_id_to_answer_id_to_user_ids.values():
    for voters in answer_id_to_user_ids.values():
        for user_id_1 in voters:
            # Every voter gets a row, even when nobody else chose this answer.
            shared_answers = user_id_to_user_id_to_ncommon_answer.setdefault(user_id_1, {})
            for user_id_2 in voters:
                if user_id_2 != user_id_1:
                    shared_answers[user_id_2] = shared_answers.get(user_id_2, 0) + 1

In [25]:
# A pair is only scored when it shares strictly more than this many polls.
MIN_NUMBER_OF_COMMON_POLLS = 50

# correlation[u1][u2] = (identical answers of u1 and u2) / (polls both voted in).
# NOTE(review): pairs without a single identical answer are skipped rather than
# stored with a correlation of 0.0 — confirm this is intended.
user_id_to_user_id_to_correlation = {}
for user_id_1, polls_by_partner in user_id_to_user_id_to_ncommon_polls.items():
    for user_id_2, ncommon_polls in polls_by_partner.items():
        answers_by_partner = user_id_to_user_id_to_ncommon_answer.get(user_id_1)
        if answers_by_partner is None or user_id_2 not in answers_by_partner:
            continue
        ncommon_answer = answers_by_partner[user_id_2]
        if not ncommon_polls > MIN_NUMBER_OF_COMMON_POLLS:
            continue
        correlation_row = user_id_to_user_id_to_correlation.setdefault(user_id_1, {})
        if user_id_2 not in correlation_row:
            correlation_row[user_id_2] = 1.0 * ncommon_answer / ncommon_polls

### Test:

In [26]:
# Spot-check one pair of users in all three tables (first -> second direction).
checked_user, other_user = 40048641, 86824543

print("polls : {}".format(user_id_to_user_id_to_ncommon_polls[checked_user][other_user]))

print("answers : {}".format(user_id_to_user_id_to_ncommon_answer[checked_user][other_user]))

print("correlation : {}".format(user_id_to_user_id_to_correlation[checked_user][other_user]))

polls : 542
answers : 253
correlation : 0.466789667896679


In [27]:
# Same spot check with the pair reversed; the tables should be symmetric.
checked_user, other_user = 86824543, 40048641

print("polls : {}".format(user_id_to_user_id_to_ncommon_polls[checked_user][other_user]))

print("answers : {}".format(user_id_to_user_id_to_ncommon_answer[checked_user][other_user]))

print("correlation : {}".format(user_id_to_user_id_to_correlation[checked_user][other_user]))

polls : 542
answers : 253
correlation : 0.466789667896679


In [28]:
# Persist the pairwise correlations as JSON inside DATA_DIR.
correlation_path = DATA_DIR + "user_id_to_user_id_to_correlation.json"
with open(correlation_path, "w") as out_file:
    json.dump(user_id_to_user_id_to_correlation, out_file)

### Finding best couple

In [29]:
def gender(user_id):
    """Return the "sex" field of the user record stored for `user_id`."""
    record = user_id_to_user[user_id]
    return record["sex"]

In [30]:
# Candidate couples as (correlation, user_id_1, user_id_2) tuples.
# Each unordered pair is kept once (the direction where user_id_1 is not the
# smaller id), and only pairs whose "sex" values differ are considered.
correlation_and_user_id_and_user_id = [
    (correlation, user_id_1, user_id_2)
    for user_id_1, partners in user_id_to_user_id_to_correlation.items()
    for user_id_2, correlation in partners.items()
    if not user_id_1 < user_id_2
    if gender(user_id_1) != gender(user_id_2)
]

In [31]:
# Preview the five best-scoring pairs; tuples compare on the correlation first, then on the ids.
sorted(correlation_and_user_id_and_user_id, reverse=True)[:5]

[(0.8627450980392157, 165466937, 16477927),
 (0.8181818181818182, 199904233, 115771312),
 (0.8181818181818182, 160630428, 97561390),
 (0.8064516129032258, 154129724, 111985504),
 (0.7978723404255319, 207631475, 118827849)]

In [32]:
# Best-correlated pairs first; keep the (user_id_1, user_id_2) part of each
# tuple where the first id is the larger one, capped at CHART_ITEM_NUMBER.
ranked_pairs = sorted(correlation_and_user_id_and_user_id, reverse=True)
couples = [
    scored_pair[1:]
    for scored_pair in ranked_pairs
    if scored_pair[1] > scored_pair[2]
][:CHART_ITEM_NUMBER]

In [33]:
# Render every couple as "<name> + <name>".
couple_names = [
    id_to_name(first_id) + " + " + id_to_name(second_id)
    for first_id, second_id in couples
]

In [34]:
# Display the formatted couples.
# NOTE(review): the rendered output contains real user names — consider clearing it before sharing.
couple_names

['Анна Шипиль + Илья Чайковский',
 'Софья Михайлова + Александр Нехаев',
 'Анна Сунцова + Игнат Полежаев',
 'Таня Рябова + Иван Белков',
 'Лена Анюшева + Антон Ковальков']

In [35]:
# Note: the "user_ids" field carries the formatted couple names, not numeric ids.
stat = {
    "stat_id": "best_couples",
    "stat_icon": "icon-people",
    "stat_name": "Тыры-Пары",
    "stat_description": "Вот им есть о чем потрепаться.",
    "user_ids": couple_names,
}

couples_stat_path = DATAX_DIR + "stat_best_couples.json"
with open(couples_stat_path, "w") as out_file:
    json.dump(stat, out_file)

### Storing user stats

In [36]:
import json

# Read the pairwise correlations back from DATA_DIR.
# JSON object keys are always strings, so the user ids in the reloaded
# mapping are str.
correlation_path = DATA_DIR + "user_id_to_user_id_to_correlation.json"
with open(correlation_path, "r") as in_file:
    user_id_to_user_id_to_correlation = json.load(in_file)

In [37]:
# Preview the best-correlated partners of the first four users in the mapping.
for shown, (user_id_1, user_id_to_correlation) in enumerate(
        user_id_to_user_id_to_correlation.items(), start=1):
    print("a: {} len: {}".format(user_id_1, len(user_id_to_correlation)))
    # Partners ordered by descending correlation.
    user_id_and_correlation = sorted(
        user_id_to_correlation.items(),
        key=lambda partner: partner[1],
        reverse=True)
    print(user_id_and_correlation[:CHART_ITEM_NUMBER])
    if shown > 3:
        break

a: 40048641 len: 411
[('69755075', 0.6097560975609756), ('120007101', 0.5915492957746479), ('184555115', 0.5752212389380531), ('183413742', 0.5735294117647058), ('194802366', 0.5714285714285714)]
a: 362942978 len: 466
[('16477927', 0.7254901960784313), ('12509299', 0.6885245901639344), ('38189179', 0.6391752577319587), ('111985504', 0.6343283582089553), ('92567621', 0.6122448979591837)]
a: 335355653 len: 412
[('74960828', 0.6470588235294118), ('199627176', 0.6434782608695652), ('184555115', 0.6344086021505376), ('194802366', 0.631578947368421), ('199904233', 0.6176470588235294)]
a: 125667595 len: 270
[('207631475', 0.6909090909090909), ('113454893', 0.671875), ('109844869', 0.632183908045977), ('21843255', 0.6037735849056604), ('198863406', 0.6)]
