# Oznaczenie mowy ofensywnej

In [41]:
from helpers import read_jsonline
import pandas as pd
import re
from tqdm.auto import tqdm

# Register tqdm's `progress_apply` on pandas objects so that long row-wise
# passes later in the notebook can display a progress bar.
tqdm.pandas()

# Read the JSON-lines dataset, keep only the last record for every `id`,
# and renumber the rows from 0 after the duplicates are gone.
df = (
    read_jsonline("../datasets/results/all.jl")
    .drop_duplicates(subset="id", keep="last")
    .reset_index(drop=True)
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 312502 entries, 0 to 312501
Data columns (total 16 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   id                 312502 non-null  int64 
 1   conversation_id    312502 non-null  int64 
 2   datetime           312502 non-null  object
 3   user_id            312502 non-null  int64 
 4   place              312490 non-null  object
 5   tweet              312502 non-null  object
 6   lang               312502 non-null  object
 7   replies_count      312502 non-null  int64 
 8   retweets_count     312502 non-null  int64 
 9   likes_count        312502 non-null  int64 
 10  retweet            312502 non-null  bool  
 11  near               312502 non-null  object
 12  geo                312502 non-null  object
 13  hashtags           312502 non-null  object
 14  place.type         12 non-null      object
 15  place.coordinates  12 non-null      object
dtypes: bool(1), int64(6)

## Wczytanie wytrenowanego modelu BiLSTM

In [10]:
import classifier_pipeline
import fasttext
import numpy as np
import torch

# fastText embedding model; `text_to_seq` queries it for one vector per token.
vec_model = fasttext.load_model("../data/models/kgr10_orths.vec.bin")

# SECURITY NOTE: torch.load unpickles the file, which can execute arbitrary
# code -- only load checkpoints that come from a trusted source.
# map_location="cpu" remaps every tensor to the CPU while deserialising, so a
# checkpoint saved on a GPU machine also loads where CUDA is unavailable
# (calling .cpu() afterwards is too late: torch.load itself would already fail).
# NOTE(review): the file appears to hold a whole pickled module; on
# PyTorch >= 2.6 torch.load defaults to weights_only=True, so loading it there
# would additionally need weights_only=False -- confirm the target version.
model = torch.load("../data/models/lstm_128_1_0_0.01.pth", map_location="cpu")
# Switch to evaluation mode (e.g. disables dropout). As the last expression of
# the cell it also displays the module's architecture.
model.eval()


LSTMClassifier(
  (lstm): LSTM(100, 128, batch_first=True, bidirectional=True)
  (dropout): Dropout(p=0, inplace=False)
  (fc): Linear(in_features=256, out_features=2, bias=True)
)

In [26]:
def text_to_seq(text: str):
    """Convert raw text into a batch-of-one tensor of word embeddings.

    Every `@` followed by non-whitespace characters is replaced with the
    placeholder "@user" and every http(s) URL with the placeholder "link".
    The normalised text is then split on whitespace and each token is looked
    up in the notebook-level fastText model `vec_model`.

    Args:
        text: The text to encode.

    Returns:
        A float32 torch tensor whose leading axis has size 1 (the batch),
        followed by one embedding vector per token.
    """
    normalised = re.sub(r"@\S+", "@user", text)
    normalised = re.sub(r"https?://\S+", "link", normalised)

    # str.split() without arguments already ignores leading/trailing whitespace.
    tokens = normalised.split()
    vectors = [vec_model.get_word_vector(token) for token in tokens]
    embedded = np.asarray(vectors, dtype=np.float32)

    # Wrap the sequence in an extra outer axis so it forms a batch of size one.
    batch = np.array([embedded])
    return torch.from_numpy(batch)

def get_class(text: str):
    """Classify one text with the loaded model and return the winning class index.

    The text is encoded with `text_to_seq`, passed through the notebook-level
    `model`, and the index of the highest-scoring output is returned.

    Args:
        text: The text to classify.

    Returns:
        The predicted class index as a plain Python int. The plotting cells of
        this notebook count the value 1 as "contains offensive speech".
    """
    seq = text_to_seq(text)
    # Pure inference: without no_grad() every call would record an autograd
    # graph that is never used, wasting time and memory over the whole dataset.
    with torch.no_grad():
        scores = model(seq)
    pred = torch.argmax(scores, dim=1)
    return pred[0].item()

## Oznaczenie tweetów o tematyce politycznej

In [42]:
# Run the classifier on every tweet and store the predicted class in a new
# `offensive` column; `progress_apply` (registered by `tqdm.pandas()`) shows
# a progress bar while the rows are processed.
df["offensive"] = df["tweet"].progress_apply(get_class)
df.head()

100%|██████████| 312502/312502 [11:11<00:00, 465.65it/s]


Unnamed: 0,id,conversation_id,datetime,user_id,place,tweet,lang,replies_count,retweets_count,likes_count,retweet,near,geo,hashtags,place.type,place.coordinates,offensive
0,1471549573017608194,1471427657552343049,2021-12-16 19:35:32 Środkowoeuropejski czas st...,3301872724,,@karnkowski @Jaroslaw_Gowin miał szansę być an...,pl,0,0,1,False,,,[],,,1
1,1471435153817976834,1471237832199098375,2021-12-16 12:00:53 Środkowoeuropejski czas st...,938387588414607360,,"@GiertychRoman @Jaroslaw_Gowin , życzenia od M...",pl,0,0,0,False,,,[],,,0
2,1471238115931176970,1471178290429837316,2021-12-15 22:57:55 Środkowoeuropejski czas st...,1098620540338749440,,@Polina12180179 @Jakub_Banaszek Za mało za zdr...,pl,0,0,0,False,,,[],,,0
3,1471150229189799936,1471118304668532755,2021-12-15 17:08:41 Środkowoeuropejski czas st...,620191832,,@UrynowiczTomasz @Jaroslaw_Gowin @Wyborcza_Kra...,pl,0,0,1,False,,,[stopsegregacjisanitarnej],,,0
4,1471116644739133451,1470800759243489285,2021-12-15 14:55:14 Środkowoeuropejski czas st...,1137128289380773888,,@KOdowy @Mentasen @Hanna_Manowska @MZ_GOV_PL @...,pl,1,0,0,False,,,[],,,1


In [43]:
# Serialise every row to its own JSON document and write the documents one per
# line (JSON-lines) to `all_tagged.jl` in the current working directory.
all_tagged = df.apply(lambda row: row.to_json(), axis=1)
with open("all_tagged.jl", "w") as out_file:
    out_file.writelines(f"{record}\n" for record in all_tagged)

In [44]:
import plotly.express as px

# Share of tweets in `df` that the classifier labelled as class 1 (offensive).
n_all = len(df)
n_offensive = len(df.loc[df["offensive"] == 1])
offensive_share = n_offensive / n_all

# Two-row table for the chart. Despite the column name, "Percentage" holds
# fractions of 1, not values out of 100.
data = dict(
    Names=["Nie zawiera mowy ofensywnej", "Zawiera mowę ofensywną"],
    Percentage=[1 - offensive_share, offensive_share],
)
df_percentage = pd.DataFrame(data)

# One slice per row; the row index is passed as the colour key so the two
# slices get distinct colours.
fig = px.pie(
    data_frame=df_percentage,
    names="Names",
    values="Percentage",
    title="Posty zawierające mowę ofensywną - tweety polityczne",
    color=df_percentage.index,
)
fig.update_layout(width=600, height=400)

fig.show()

## Oznaczenie zbioru aktualnie popularnych tweetów (mniej kontrowersyjnych)

In [31]:
# Load the popular-tweets dataset and keep only the last record for every `id`.
df_popular = read_jsonline("../datasets/popular_now.jl")
df_popular = df_popular.drop_duplicates(subset='id', keep="last").reset_index(drop=True)

# Polish tweets only. The explicit .copy() makes the filtered frame an
# independent object: assigning a new column to a boolean-mask slice is the
# classic trigger for pandas' SettingWithCopyWarning.
df_popular = df_popular[df_popular["lang"] == "pl"].copy()

# Run the classifier on every remaining tweet and store the predicted class.
df_popular['offensive'] = df_popular['tweet'].progress_apply(get_class)

100%|██████████| 4563/4563 [00:12<00:00, 356.69it/s]


In [32]:
# Number of popular tweets assigned to each predicted class.
df_popular["offensive"].value_counts()

3918

In [33]:
# Number of rows left in `df_popular` after deduplication and the language filter.
len(df_popular)

4563

In [40]:
import plotly.express as px

# Share of tweets in `df_popular` that the classifier labelled as class 1 (offensive).
n_all = len(df_popular)
n_offensive = len(df_popular.loc[df_popular["offensive"] == 1])
offensive_share = n_offensive / n_all

# Two-row table for the chart. Despite the column name, "Percentage" holds
# fractions of 1, not values out of 100.
data = dict(
    Names=["Nie zawiera mowy ofensywnej", "Zawiera mowę ofensywną"],
    Percentage=[1 - offensive_share, offensive_share],
)
df_percentage = pd.DataFrame(data)

# One slice per row; the row index is passed as the colour key so the two
# slices get distinct colours.
fig = px.pie(
    data_frame=df_percentage,
    names="Names",
    values="Percentage",
    title="Posty zawierające mowę ofensywną - popularne tweety",
    color=df_percentage.index,
)
fig.update_layout(width=600, height=400)

fig.show()