# Analiza pozyskanych danych

In [1]:
import os
import json
import pandas as pd
from os.path import abspath, basename, join, splitext
from consts import parties_color_map, query_name_mapping


## 1. Liczba  i rozkład tweetów

In [8]:
tweets_path = "../results"

parties_stats = {}
previous_num_unique_tweets = 0
all_tweets = set()

for dirpath, dirnames, files in os.walk(tweets_path):
    sum_tweets = 0
    for file_path in files:
        file_path = os.path.join(dirpath, file_path)
        with open(file_path) as file:
            tweets = [json.loads(line)["tweet"] for line in file]
            sum_tweets += len(tweets)
            all_tweets.update(tweets)
    party_name = splitext(basename(dirpath))[0]
    party_name = query_name_mapping.get(party_name, party_name)
    parties_stats[party_name] = (sum_tweets, len(all_tweets) - previous_num_unique_tweets)
    previous_num_unique_tweets = len(all_tweets)

del parties_stats['results']

In [11]:
sum_all = sum([x[0] for x in parties_stats.values()])
sum_unique = sum([x[1] for x in parties_stats.values()])

def print_nice(tab):
    l = [' ' + x if i % 3 == 2 else x for i,x in enumerate(str(tab)[::-1])]
    l = l[::-1]
    return "".join(l)


parties_stats_in_lists = [list(t) for t in list(zip(*parties_stats.values()))]
all_t = parties_stats_in_lists[0]
unique_t = parties_stats_in_lists[1]
d = {'Partia': list(parties_stats.keys()), 'Wszystkie tweety': all_t , 'Unikalne tweety': unique_t}
df = pd.DataFrame(data=d)
df

Unnamed: 0,Partia,Wszystkie tweety,Unikalne tweety
0,Koalicja Obywatelska,154099,110907
1,Koalicja Polska,8521,5477
2,Konfederacja,51690,45162
3,Kukiz,1500,1302
4,Lewica,34304,29471
5,PiS,125030,107632
6,Polska 2050,11915,7240
7,Polskie Sprawy,1098,924
8,Porozumienie,648,425
9,PPS,3225,2659


In [10]:
print(f"Wszystkie tweety: {print_nice(sum_all)}")
print(f"Unikalne tweety: {print_nice(sum_unique)}")

Wszystkie tweety:  392 030
Unikalne tweety:  311 199


In [12]:
import plotly.express as px
from consts import parties_color_map
fig = px.pie(df, values='Unikalne tweety', names='Partia', title='Rozkład partii w zbiorze tweetów', color="Partia", color_discrete_map=parties_color_map)

fig.show()

In [13]:
fig = px.pie(df, values='Unikalne tweety', names='Partia', title='Rozkład partii w zbiorze tweetów', color="Partia", color_discrete_map=parties_color_map)
fig.update_traces(textposition='inside', textinfo='percent+label')
fig.show()

## 2. Lokalizacje użytkowników

In [14]:
locations = []

with open("../datasets/unique-users-info/users-all-unique.jl", "r") as file:
        all_users = 0
        for line in file:
            line = json.loads(line)

            location = (line)["location"]
            if location:
                locations.append(location)

            all_users += 1

print(f"Unikalni użytkownicy: {print_nice(all_users)}")
print(f"Liczba lokalizacji: {print_nice((len(locations)))}")

locations = list(set(locations))
print(f"Liczba unikalnych lokalizacji: {print_nice(len(locations))}")



Unikalni użytkownicy: 42 146
Liczba lokalizacji: 17 655
Liczba unikalnych lokalizacji: 5 560


In [15]:
import geocoder

def get_coords(name):
    try:
        g = geocoder.osm(name)

        if g and g.json['country'] == 'Polska':   # filters dump names like "better part of the World" or "G....."
            return g.json['lat'], g.json['lng']

        elif len(name.split()) > 1:             # retries with only first name
           return get_coords(name.split()[0])
        
        else:
            return None
    except:
        if len(name.split()) > 1:
            return get_coords(name.split()[0])
        else:
            return None

## 2.1 Test geocodera

In [17]:
# test
locations_coords = [get_coords(loc) for loc in locations[:10]]

for i, loc in enumerate(locations[:10]):
    print(f"{loc} --> {(locations_coords[i])}")

Wolska --> (52.2317757, 20.9589781)
Stolica disco-polo, Lemingrad --> None
moravia --> None
HH --> None
Stargard gmina  --> (53.2829538, 15.33305619859432)
Visegrad Region --> None
Brzesko, Polska --> (49.9678396, 20.6068496)
Motherwell, Scotland --> None
Greece --> None
Skoczów --> (49.7999039, 18.7877583)


## 2.2 Poprawne lokalizacje

In [76]:
import pickle

from tqdm import tqdm
from os.path import exists

LOOK_UP_PATH = './locations_look_up.pickle'

if(exists(LOOK_UP_PATH)):
    with open(LOOK_UP_PATH, 'rb') as handle:
        locations_look_up = pickle.load(handle)
else:
    locations_look_up = {}
    for location in tqdm(locations):
        coords = get_coords(location)
        if coords:
            locations_look_up[location] = coords

    with open(LOOK_UP_PATH, 'wb') as handle:
        pickle.dump(locations_look_up, handle, protocol=pickle.HIGHEST_PROTOCOL)


print(f"Poprawnych lokalizacji (w Polsce): {len(locations_look_up)} \n\nPrzykłady:")

for loc, coords in list(locations_look_up.items())[0:20]:
    print(f"{loc} --> {coords}")

Poprawnych lokalizacji (w Polsce): 2841 

Przykłady:
Jawór  --> (51.067702100000005, 16.197613635866436)
Piekło Dolne, gmina Wygwizdów  --> (53.79043045, 19.71753003227758)
Kielce/Świat --> (50.8746431, 20.6299195)
Śląsk / Poznań --> (49.9716667, 19.0897222)
gdynia  --> (54.5164982, 18.5402738)
Marki --> (52.336375200000006, 21.120778759940738)
Wilkowyje, Polska --> (50.147191, 18.9515894)
Łańcut  --> (50.069594949999995, 22.233505598976215)
San Escobar  --> (53.1239425, 18.0006012)
Górny Śląsk  --> (50.6680553, 23.7721344)
Warszawa, Dublin, Nowy Jork --> (52.2319581, 21.0067249)
53.121750, 17.995540 --> (53.1217698, 17.9948495)
Legnica --> (51.204768900000005, 16.174675034638256)
PL / WAW --> (52.163068100000004, 20.970804316016185)
Serock --> (52.5135193, 21.0731162)
Magdalenòw, gmina Szczerców --> (51.35964, 19.15561)
Dobra (Szczecińska), Polska --> (53.4868077, 14.3857925)
Oświęcim / Kraków --> (50.0336707, 19.260079835793633)
Polska  ur, Zielona Góra. --> (52.215933, 19.134422)
Ma

In [77]:
# filter coords indicating whole country: Poland

poland_cords = locations_look_up["Polska"]
print(f'Poland/Polska --> {poland_cords}')

print(f'Przed filtrowaniem: {len(locations_look_up)}')
locations_look_up = {k: v for k, v in locations_look_up.items() if v != poland_cords}
print(f'Po filtrowaniu: {len(locations_look_up)}')

Poland/Polska --> (52.215933, 19.134422)
Przed filtrowaniem: 2841
Po filtrowaniu: 2678


In [78]:
# add location info from posts

df_tweets_with_cords = df_tweets_tagged[df_tweets_tagged["place.coordinates"].notnull()].loc[:, ["user_id", "place.coordinates"]]
for _, row in df_tweets_with_cords.iterrows():
    userd_id = row[0]
    coords = tuple(row[1])
    locations_look_up[user_id] = coords

print(f'Po dodaniu lokacji przypisanej do postów: {len(locations_look_up)}')

Po dodaniu lokacji przypisanej do postów: 2679


## 2.3 Liczba tweetów z przyporządkowaną poprawną lokalizacją

In [79]:
user_locations_lookup = {}

with open("../datasets/unique-users-info/users-all-unique.jl") as file:
    for line in file:
        line = json.loads(line)
        user_loc = line["location"]
        if user_loc in locations_look_up:
            user_id = line["id"] 
            user_locations_lookup[user_id] = locations_look_up[user_loc]

print(f"\nUżytkowników z poprawną lokalizacją: {print_nice(len(user_locations_lookup))} ({(100 * len(user_locations_lookup) / all_users):.4}%) \n\nPrzykłady:")

for user, loc in list(user_locations_lookup.items())[:3]:
    print(f'User_id:{user:>20}   location: {loc}')




Użytkowników z poprawną lokalizacją: 10 812 (25.65%) 

Przykłady:
User_id: 1418618065969991686   location: (50.2780834, 19.1342944)
User_id: 1150688690387279872   location: (50.2780834, 19.1342944)
User_id:          2435457325   location: (50.2780834, 19.1342944)


In [81]:
df_user_loc = pd.DataFrame(user_locations_lookup.items(), columns=["user_id","user_loc"])
df_user_loc

Unnamed: 0,user_id,user_loc
0,1418618065969991686,"(50.2780834, 19.1342944)"
1,1150688690387279872,"(50.2780834, 19.1342944)"
2,2435457325,"(50.2780834, 19.1342944)"
3,918974704098185216,"(50.721273, 23.259588703837892)"
4,732565121814409217,"(50.0728604, 19.9349365)"
...,...,...
10807,523534275,"(51.1089776, 17.0326689)"
10808,796750430474747906,"(54.36119285, 18.62860883362069)"
10809,1470090710804217858,"(50.5687422, 19.2343995)"
10810,914827806718222336,"(52.2319581, 21.0067249)"


In [90]:
from helpers import read_jl_to_dataframe

df_tweets_tagged = read_jl_to_dataframe("../datasets/all_tagged.jl")
df_tagged_loc = df_tweets_tagged.merge(df_user_loc, on="user_id")
df_tagged_loc.shape

with open("all_tagged_with_location.pickle", "wb") as handle:
    pickle.dump(df_tagged_loc, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [93]:
import plotly.express as px

data = {
    "Names": ["Z lokalizacją", "Brak lokalizacji"],
    "Percentage": [
        df_tagged_loc.shape[0] / (df_tweets_tagged.shape[0]),
        1 - df_tagged_loc.shape[0] / df_tweets_tagged.shape[0],
    ],
}

df = pd.DataFrame(data)

fig = px.pie(
    df,
    values="Percentage",
    names="Names",
    title="Tweety zawierające poprawną lokalizację",
    color=df.index,
)

fig.show()