https://docs.google.com/spreadsheets/d/1XttPTaqV1D_W7dfYqC_4pypOlvnUSM0Itk2nXmv8Yrc/edit#gid=1569942191%5C

In [1]:
import re

import numpy as np
import pandas as pd

In [2]:
mdf_v2 = pd.read_csv("mdf_v2.csv.zip", compression="zip", index_col=0)
mdf_v2.head()

Unnamed: 0,word,positive
0,\bбезопасн+,True
1,\bмир+,True
2,\bсострада+,True
3,\bэмпат+,True
4,\bсочувств+,True


In [3]:
sheet = pd.read_excel("morality-dictionary.xlsx")
sheet = sheet[["translation", "vice_virtue"]]
sheet = sheet.dropna()
sheet["translation"] = sheet["translation"].str.replace(", ", ",")
sheet["translation"] = sheet["translation"].str.strip().replace(" ", ",")
sheet.head()

Unnamed: 0,translation,vice_virtue
0,безопасн*,virtue
1,мир*,virtue
2,сострада*,virtue
3,эмпат*,virtue
4,сочувств*,virtue


In [4]:
sheet["vice_virtue"] = sheet.apply(lambda row: row["vice_virtue"].split(",")[0], axis=1)
sheet = sheet[sheet["vice_virtue"].isin(["vice", "virtue"])]

In [5]:
sheet["postive"] = sheet["vice_virtue"] == "virtue"
del sheet["vice_virtue"]
sheet.head()

Unnamed: 0,translation,postive
0,безопасн*,True
1,мир*,True
2,сострада*,True
3,эмпат*,True
4,сочувств*,True


In [6]:
sheet.tail()

Unnamed: 0,translation,postive
313,"зло*,злю*,зла*,злу",False
314,"порок*,порочн*",False
315,эксплуата*,False
316,извращен*,False
317,несчаст*,False


In [7]:
mdf = []
for _, (words_str, positive) in sheet.iterrows():
    words = words_str.split(",")
    if len(words) == 1:
        words = words[0].strip().split()
    for word in words:
        mdf.append({"word": word, "positive": positive})
mdf = pd.DataFrame(mdf)

In [8]:
mdf["word"] = mdf["word"].apply(lambda x: r"\b" + x.replace("*", "+") if "*" in x else r"\b" + x + r"\b")

In [9]:
mdf.to_csv("morality.csv.zip", compression="zip")

In [10]:
combined = pd.concat([mdf_v2, mdf])
combined.to_csv("morality_with_v2.csv.zip", compression="zip")

In [11]:
combined.groupby(combined.columns.tolist(),as_index=False).size()

Unnamed: 0,word,positive,size
0,\bавторит+,True,2
1,\bагресс+,False,2
2,\bаморал+,False,1
3,\bатак+,False,2
4,\bбеззаконн+,False,2
...,...,...,...
349,\bэксплуатат+,False,2
350,\bэксплуатац+,False,2
351,\bэксплуатир+,False,2
352,\bэмпат+,True,2


In [12]:
text = "атак закон"

In [13]:
mdf_v2_positive = re.compile("(" + "|".join(mdf_v2[mdf_v2["positive"]]["word"]) + ")")
mdf_v2_negative = re.compile("(" + "|".join(mdf_v2[~mdf_v2["positive"]]["word"]) + ")")
mdf_positive = re.compile("(" + "|".join(mdf[mdf["positive"]]["word"]) + ")")
mdf_negative = re.compile("(" + "|".join(mdf[~mdf["positive"]]["word"]) + ")")
combined_positive = re.compile("(" + "|".join(combined[combined["positive"]]["word"]) + ")")
combined_negative = re.compile("(" + "|".join(combined[~combined["positive"]]["word"]) + ")")

In [14]:
assert mdf_v2_positive.findall(text) == mdf_positive.findall(text) == combined_positive.findall(text)
%timeit mdf_v2_positive.findall(text)
%timeit mdf_positive.findall(text)
%timeit combined_positive.findall(text)

1.29 µs ± 148 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
1.36 µs ± 66.7 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
1.98 µs ± 86.1 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


In [15]:
assert mdf_v2_negative.findall(text) == mdf_negative.findall(text) == combined_negative.findall(text)
%timeit mdf_v2_negative.findall(text)
%timeit mdf_negative.findall(text)
%timeit combined_negative.findall(text)

1.18 µs ± 28.3 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
1.36 µs ± 135 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
1.85 µs ± 67.2 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
