# From archives of the Twitter API, extract tweets in Polish with no attached media

In [1]:
import pandas as pd
import pathlib
import bz2
import json
import concurrent.futures
from tqdm import tqdm
import re
import langdetect
import pickle
import fastprogress

In [2]:
# Directory that is scanned for the pickled tweet archive files.
source_dir = pathlib.Path('/mnt/dane/twitter/')

In [3]:
# Keep every directory entry whose name contains '.tar.pkl'.
# iterdir() yields entries in arbitrary order, so sort the paths to make the
# processing order (and everything derived from it) reproducible between runs.
source_files = sorted(x for x in source_dir.iterdir() if '.tar.pkl' in x.name)

In [4]:
def get_texts(sfile):
    """Collect the text of short, link-free and media-free tweets from one pickle file.

    The pickle is expected to hold an iterable of iterables of tweet dicts.
    For a retweet, the original tweet stored under ``retweeted_status`` is
    inspected instead of the wrapping tweet. A tweet is skipped when it has
    an ``extended_tweet`` payload, is flagged ``truncated``, or carries any
    ``media`` or ``urls`` entities.

    Args:
        sfile: Path of the pickle file to read.

    Returns:
        List with the ``text`` value of every tweet that passed the filters,
        in the order the tweets appear in the file.

    Raises:
        KeyError: If an inspected tweet lacks the ``truncated`` or ``text`` key.
    """
    # NOTE: unpickling can execute arbitrary code - only load trusted archives.
    # The context manager closes the handle right after loading instead of
    # leaving it to the garbage collector.
    with open(sfile, 'rb') as fh:
        batches = pickle.load(fh)

    texts = []
    for batch in batches:
        for t in batch:
            # Retweets wrap the original tweet; judge and keep the original.
            current = t.get('retweeted_status') or t
            if current.get('extended_tweet') or current['truncated']:
                continue
            # 'entities' may be absent or None; treat both as "no entities".
            entities = current.get('entities') or {}
            if entities.get('media') or entities.get('urls'):
                continue
            texts.append(current['text'])
    return texts


In [5]:
# Extract the candidate texts of every archive file: one list per file,
# with a progress bar over the files.
sub_texts = list(map(get_texts, tqdm(source_files)))

100%|██████████| 292/292 [02:50<00:00,  1.54it/s]


In [6]:
# Flatten the per-file lists into a single list of tweet texts.
texts = [
    tweet_text
    for file_texts in sub_texts
    for tweet_text in file_texts
]

In [8]:
def get_proba_of_lang(t: str, lang: str = 'pl'):
    """Return the probability langdetect assigns to language *lang* for text *t*.

    Args:
        t: Text to run language detection on.
        lang: Language code to look up among the detected candidates.

    Returns:
        The probability of the first detected candidate whose language equals
        *lang*; ``0`` when no candidate matches or when detection raises
        ``LangDetectException``.
    """
    try:
        candidates = langdetect.detect_langs(t)
    except langdetect.lang_detect_exception.LangDetectException:
        return 0

    for candidate in candidates:
        if candidate.lang == lang:
            return candidate.prob
    return 0


In [9]:
# Score every text with get_proba_of_lang in 16 worker processes.
# chunksize makes the pool ship the inputs to the workers in batches instead
# of one pickled item per task, which cuts the inter-process overhead for a
# very long iterable; the order of the results is unchanged.
with concurrent.futures.ProcessPoolExecutor(16) as pool:
    results = list(tqdm(pool.map(get_proba_of_lang, texts, chunksize=1024), total=len(texts)))

100%|██████████| 1291546/1291546 [10:33<00:00, 2038.28it/s] 


In [10]:
# Pair every text with the probability computed for it.
df = pd.DataFrame(data=dict(text=texts, probability=results))

In [11]:
# Keep only the rows whose probability is strictly above 0.99.
df = df[df['probability'] > 0.99]

In [12]:
# Only the text column is needed from here on.
filtered = df['text']

In [14]:
# Number of texts that survived the probability filter.
filtered.shape[0]

905780

In [15]:
# An @-mention is "@" followed by at least one handle character. Requiring one
# or more characters keeps a lone "@" (e.g. "5 zl @ szt.") from being turned
# into a fake mention.
at_mention = re.compile(r'@[a-zA-Z0-9_]+')


def remove_handles(t):
    """Replace every @-mention in *t* with the ``@anonymized_account`` placeholder.

    Args:
        t: Text to anonymize.

    Returns:
        A copy of *t* in which each ``@`` followed by one or more ASCII
        letters, digits or underscores is replaced by ``@anonymized_account``.
    """
    return at_mention.sub('@anonymized_account', t)

In [16]:
def replace_newlines(t):
    """Return *t* with each newline character turned into the two characters ``\\n``."""
    escaped_newline = '\\n'
    lines = t.split('\n')
    return escaped_newline.join(lines)

In [31]:
# A URL runs up to the next whitespace character. Matching with \S+ instead of
# a greedy ".*" keeps the text that follows a link on the same line.
# An optional single space in front of the URL is consumed as well, because
# the replacement brings its own leading space.
URL = re.compile(r' ?https?://\S+')


def replace_urls(t):
    """Replace every http(s) URL in *t* with the ``@url`` placeholder.

    Args:
        t: Text to process.

    Returns:
        A copy of *t* in which each URL (together with one directly preceding
        space, if present) is replaced by ``' @url'``; all other text,
        including whatever follows a URL, is preserved.
    """
    return URL.sub(' @url', t)

In [32]:
# Anonymize the @-mentions in every text.
processed = filtered.map(remove_handles)

In [33]:
# Swap the URLs in every text for the placeholder.
processed = processed.map(replace_urls)

In [34]:
# processed = processed.apply(replace_newlines)

In [35]:
# Label column for the unlabelled tweets: the placeholder '?' for every row.
# Broadcasting the scalar over the index avoids building a NaN-filled Series
# first and relying on the default dtype of a data-less Series.
labels = pd.Series('?', index=processed.index)

In [36]:
# Put the label column and the processed text side by side, label first.
resdf = pd.concat(objs=[labels, processed], axis='columns')

In [37]:
# # texts = [t.replace('\\n', '\n') for t in texts]
# texts = [t.replace('\\"', '"') for t in texts]
# URL = re.compile(' ?https?://.*')
# texts = [URL.sub(' @url', t) for t in texts]

In [40]:
# Row count before de-duplication.
resdf.shape[0]

905780

In [45]:
# Drop rows that repeat an earlier row, keeping the first occurrence.
resdf = resdf.loc[~resdf.duplicated()]

In [47]:
# Row count after de-duplication.
resdf.shape[0]

720706

In [49]:
# Write the unlabelled tweets as a bare CSV: no header row, no index column.
resdf.to_csv(
    path_or_buf='/home/tomasz/nlp/poleval/csv/unsup-twitter.csv',
    header=False,
    index=False,
)

In [55]:
# Read the header-less training CSV, reusing resdf's column names so that the
# two frames line up.
labelled = pd.read_csv(
    filepath_or_buffer='/home/tomasz/nlp/poleval/csv/task1_train.csv',
    names=resdf.columns,
    header=None,
)

In [60]:
# Stack the unlabelled tweets on top of the labelled training rows.
alldf = pd.concat(objs=[resdf, labelled], axis='index')

In [61]:
# Peek at the first five rows.
alldf.head(n=5)

Unnamed: 0,0,text
2,?,"słowo ""kocham"" powinno być cenne, a jest naduż..."
3,?,"ja tu nie wytrzymam półtorej godziny, no ludzi..."
6,?,@anonymized_account nie o tym mówię;/////\n\njest
9,?,nie chce mi się iść spać ale mi się chce \nO K...
12,?,Na razie tempo mocno wakacyjne. Roma na stojak...


In [62]:
# Peek at the last five rows.
alldf.tail(n=5)

Unnamed: 0,0,text
10036,0,@anonymized_account Ty zagrasz? Nie wiedziałem 😉
10037,0,@anonymized_account @anonymized_account A VAR ...
10038,0,@anonymized_account @anonymized_account Szanow...
10039,0,@anonymized_account @anonymized_account @anony...
10040,0,@anonymized_account A wróżbita Maciej mówi że ...


In [64]:
# Write the combined frame as a bare CSV: no header row, no index column.
alldf.to_csv(
    path_or_buf='/home/tomasz/nlp/poleval/csv/unsup.csv',
    header=False,
    index=False,
)