# Create DIKI Large using Word Embeddings

Meta information removed for blind review.

In [1]:
import pandas as pd

#### Read DIKI small

In [2]:
df = pd.read_csv('DIKI_small.csv')

In [3]:
df.head()

Unnamed: 0,key words
0,"""absurdistan"""
1,"""bild-dungsb√ºrger"""
2,"""eierlosen"""
3,"""entsorgen"""
4,"""entsorgt"""


In [4]:
len(df)

2874

In [5]:
low_incivility_unigrams = list(df["key words"])

In [6]:
low_incivility_unigrams[:20]

['"absurdistan"',
 '"bild-dungsb√ºrger"',
 '"eierlosen"',
 '"entsorgen"',
 '"entsorgt"',
 '"fl√ºchtling"',
 '"fl√ºchtlinge"',
 '"friedenstaube"',
 '"g√§sten"',
 '"heimkehrer"',
 '"integrieren"',
 '"m√§nnergruppen"',
 '"pirat"',
 '"schiffbr√ºchigen"',
 '"schutzberechtigten"',
 '"sozialpopulist"',
 '"spitzenmann"',
 '"w√§hler"',
 '"zecken"',
 '"ziegenhirten"']

## Extend DIKI small with Word Embedding Model

In [7]:
import gensim
from gensim.models.wrappers import FastText

In [8]:
model = FastText.load_fasttext_format('cc.de.300.bin') #from https://fasttext.cc/docs/en/crawl-vectors.html

In [9]:
model.most_similar("gepisst", topn=10) #test haha

[('abgepisst', 0.8173563480377197),
 ('vollgepisst', 0.7921205759048462),
 ('gepinkelt', 0.7558916807174683),
 ('gekackt', 0.7330619096755981),
 ('pisst', 0.7125478982925415),
 ('pissen', 0.7026624083518982),
 ('bepisst', 0.6910557150840759),
 ('pisse', 0.6593301296234131),
 ('gepullert', 0.6518568396568298),
 ('angepisst', 0.6470769047737122)]

#### Get the 10 most similar words for each entry in DIKI small

In [10]:
word_embeddings_10 = []

for i in low_incivility_unigrams:
    try:
        top10= model.most_similar(i, topn=10)
        for e in top10:
            word_embeddings_10.append(e)
    except:
        print(i) #Entries of DIKI small that are not in the model
        pass

#fckafd
#fckcdu
#fckdlf
#fcknzs
#fcksky
#msm
#noafd
#nospd
*opfaopfaopfa*
ar***
ars***
fckafd
geb*****t
hdf
kot***
nsafd
opfa
sc**i√üe
sch‚Ä¶
ww3
‚ò†Ô∏è
‚ò†Ô∏èüò¶üò¶‚ò†Ô∏è
‚ò™Ô∏è
‚òªüëπü§ò
‚úàÔ∏è
‚úãüëç
üá©üá™
üá©üá™üëç
üå¨Ô∏è
üëä
üëçüá©üá™
üëé
üëéüèΩ
üëéüñï
üëπ
üëø
üíîüò°üò°üëéüèΩüëéüèΩüëéüèΩ
üí®
üí©
üí©ü§¢
üñï
üñïüèª
üñïüèΩ
üòÇüòÇüëç
üòÇüòÇüòÇüëéüñï
üòà
üòâüëé
üòî
üòõ
üòõüñïüëé
üòú
üòúüëéüñï
üò†
üò°
üò°üëé
üò§
üò¶
üò®
üò®üëé
üò®üò§
üò©
üò´
üò¨
üòµ
üòæ
üôÑ
üôéüèø‚Äç‚ôÇÔ∏è
üöÄ
üöΩ
üõ§
ü§ê
ü§ò
ü§õ
ü§°
ü§¢
ü§¢üí©
ü§£
ü§¶‚Äç
ü§¶üèª
ü§´
ü§¨
ü§Æ
ü§∑üèΩ
ü§ºüèª


In [11]:
word_embeddings_10 = list(set(word_embeddings_10))

In [12]:
len(word_embeddings_10)

27800

In [13]:
word_embeddings_10[:20]

[('Gro√üfahrzeugen', 0.46548447012901306),
 ('Theatermensch', 0.4346446394920349),
 ('irrsinniger', 0.5550733804702759),
 ('StarMorgansMoulinexnilcoNilfiskNogmaticNovamaticOBHOMEGAPANASONICParksidePhilipsPrimeraPRIVILEGProfectisPROGRESSProfiloProtosQuiggRotelRowentaRowiSaphirSalcoSamsungSatrapSeverinShopVacSidemSidemeSidexSIEMENSSimpaSingerSilvaSimpexStandardSMCSolacSuperiorSwirlTaurusTCM',
  0.7358262538909912),
 ('bekloppte', 0.6523733139038086),
 ('ARIArieteArrowheadAthenaBeruBMWBoschBuzettiChampionDAYCODellortoDIDDucatiEBCEXIDEFehlingFujiHAWKERHIFLOJMPJMTJTK',
  0.6052872538566589),
 ('schwulenfeindlicher', 0.5615580081939697),
 ('Leitmetzerin', 0.5199687480926514),
 ('Philanthropismus', 0.4906107187271118),
 ('Ahmadi-Muslime', 0.5245018601417542),
 ('schniedel', 0.587764322757721),
 ('pussy', 0.6608189344406128),
 ('perverses', 0.6499999165534973),
 ('Knallcharge', 0.529083788394928),
 ('A-ZBergschuleAusr√ºstungs-Top-10Die', 0.4964744448661804),
 ('Klimamodell', 0.6885697841644287

#### Set Threshold (Consine Similarity) to 0.6

In [14]:
word_embeddings_10_06=[]
for i in word_embeddings_10:
    if i[1] > 0.6:
        word_embeddings_10_06.append(i)

In [15]:
len(word_embeddings_10_06)

8588

In [16]:
word_embeddings_10_06[:10] #some wierd stuff in there...

[('StarMorgansMoulinexnilcoNilfiskNogmaticNovamaticOBHOMEGAPANASONICParksidePhilipsPrimeraPRIVILEGProfectisPROGRESSProfiloProtosQuiggRotelRowentaRowiSaphirSalcoSamsungSatrapSeverinShopVacSidemSidemeSidexSIEMENSSimpaSingerSilvaSimpexStandardSMCSolacSuperiorSwirlTaurusTCM',
  0.7358262538909912),
 ('bekloppte', 0.6523733139038086),
 ('ARIArieteArrowheadAthenaBeruBMWBoschBuzettiChampionDAYCODellortoDIDDucatiEBCEXIDEFehlingFujiHAWKERHIFLOJMPJMTJTK',
  0.6052872538566589),
 ('pussy', 0.6608189344406128),
 ('perverses', 0.6499999165534973),
 ('Klimamodell', 0.6885697841644287),
 ('hellbraunen', 0.760926365852356),
 ('geldsklaven', 0.667251706123352),
 ('amateurin', 0.6524983048439026),
 ('Luftpumpe', 0.6812489032745361)]

In [17]:
word_embeddings_10_06_low = list(set([x[0].lower() for x in word_embeddings_10_06]))
#make lower case and remove duplicates

In [18]:
len(word_embeddings_10_06_low)

5510

In [22]:
DIKI_large = list(set(word_embeddings_10_06_low + low_incivility_unigrams))
#DIKIsmall + Synonyms

In [23]:
df_DIKIlarge = pd.DataFrame({'key words':DIKI_large})
df_DIKIlarge

Unnamed: 0,key words
0,prek√§rster
1,intoleranz
2,"""eierlosen"""
3,faseln
4,#furchtbar
...,...
7391,weinerlich
7392,staatsfunktionen
7393,rummotzen
7394,bauelementefestwiderst√§ndesmd-chip-widerst√§nde...


In [24]:
df_DIKIlarge.to_csv("DIKI_large.csv", index=None, encoding="utf-8")