In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

np.random.seed(420)

%matplotlib inline

In [3]:
df = pd.read_csv(r"T:\Разное\Папка Тимика\Codifs\xls parser\df_processed.csv", engine="python", encoding="utf-8")
df = df.drop(columns=["Type"])
print(df.shape)
df.head()

(84373, 2)


Unnamed: 0,Text,Code
0,дельфинарий в зоопарке,0
1,хзш,1
2,карта кукуруза,1
3,аффрин,0
4,тлртлотлотдотдот,1


In [4]:
from bokeh.layouts import gridplot
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook

output_notebook()


def make_plot(title, hist, edges, x):
    p = figure(title=title, tools='', background_fill_color="#fafafa")
    p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
           fill_color="navy", line_color="white", alpha=0.5)

    p.y_range.start = 0
    p.legend.location = "center_right"
    p.legend.background_fill_color = "#fefefe"
    p.xaxis.axis_label = 'x'
    p.yaxis.axis_label = 'Pr(x)'
    p.grid.grid_line_color="white"
    return p

lens = df.Text.str.len()
hist, edges = np.histogram(lens, density=True, bins=50)
x = np.arange(0, lens.max(), 5)

show(make_plot("Length distribution", hist, edges, x))

In [45]:
lens.median()

12.0

In [83]:
lens.describe()

count    84373.000000
mean        18.217084
std         18.889894
min          1.000000
25%          7.000000
50%         12.000000
75%         22.000000
max        140.000000
Name: Text, dtype: float64

In [155]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

def text2seq(str_len, df):
    tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')
    tk.fit_on_texts(df.Text)

    alphabet="0123456789йцукенгшщзхъфывапролджэячсмитьбю !,.?"
    if len(alphabet) != len(set("".join(df.Text))):
        print("Alphabets are not a match!")

    char_dict = {}
    for i, char in enumerate(alphabet):
        char_dict[char] = i + 1
    vocab_size = len(char_dict)
    
    tk.word_index = char_dict.copy()
    tk.word_index[tk.oov_token] = max(char_dict.values()) + 1

    X = tk.texts_to_sequences(df.Text)
    X = pad_sequences(X, maxlen=str_len, padding='post', truncating='post')

    return X

str_len = 50
X = text2seq(str_len, df)
y = np.array(df.Code)

In [186]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Input
from keras.layers.embeddings import Embedding


def build_model(embedding_dim, str_len):
    vocab_size = len(alphabet) + 1
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=str_len))
    model.add(Flatten())
    model.add(Dense(1, activation="sigmoid"))
    return model

model = build_model(len(alphabet), str_len)
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 50, 47)            2256      
_________________________________________________________________
flatten_6 (Flatten)          (None, 2350)              0         
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 2351      
Total params: 4,607
Trainable params: 4,607
Non-trainable params: 0
_________________________________________________________________
None


In [160]:
from sklearn.model_selection import train_test_split


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=10000, random_state=420, stratify=y)

X_train.shape

(74373, 50)

In [177]:
model.fit(X_train, y_train, epochs=10, verbose=1)
loss, accuracy = model.evaluate(X_test, y_test, verbose=1)
print(accuracy)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.6144


In [170]:
y_res = model.predict(X_test)
y_res.mean()

0.51332873

In [165]:
loss, accuracy = model.evaluate(X, y, verbose=1)



In [183]:
from sklearn.feature_extraction.text import TfidfVectorizer

char_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    analyzer='char',
    ngram_range=(1, 6),
    max_features=30000)

tfidf_char = char_vectorizer.fit_transform(df.Text)

tfidf_char

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


<84373x30000 sparse matrix of type '<class 'numpy.float64'>'
	with 5482055 stored elements in Compressed Sparse Row format>

In [187]:
X_train, X_test, y_train, y_test = train_test_split(
    tfidf_char, y, test_size=10000, random_state=420, stratify=y)

X_train.shape

(74373, 30000)

In [188]:
model = Sequential()
model.add(Dense(1, input_shape=(X_train.shape[1],), activation="sigmoid"))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_9 (Dense)              (None, 1)                 30001     
Total params: 30,001
Trainable params: 30,001
Non-trainable params: 0
_________________________________________________________________
None


In [189]:
model.fit(X_train, y_train, epochs=10, verbose=1)
loss, accuracy = model.evaluate(X_test, y_test, verbose=1)
print(accuracy)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
0.8437


In [227]:
import g2p_en as g2p
from transliterate import translit
import re

tdict_path = r"C:\Users\khasanov\Desktop\transcript_dict.txt"
dict_file = open(tdict_path, mode='r', encoding='utf-8')
lines = dict_file.readlines()
transcript_dict = {}
for line in lines:
    line = line.strip()
    if len(line.split(":")) != 2:
        continue
    k, v = line.split(":")
    transcript_dict[r"\b"+k+r"\b"] = v
del lines
dict_file.close()


def transcript(word):
    assert isinstance(word, (str, int))
    word = str(word)
    phoneme_dict = {
        "OW0": "О",  # whats this
        "OW1": "О",
        "OW2": "О",
        "AA0": "А",
        "AA1": "А",
        "AA2": "А",
        "AE0": "А",
        "AE1": "А",
        "AE2": "А",
        "AH0": "А",
        "AH1": "А",
        "AH2": "А",
        "AO0": "О",
        "AO1": "А",
        "AO2": "А",
        "AW0": "АУ",
        "AW1": "АУ",
        "AW2": "АУ",
        "AY0": "АЙ",
        "AY1": "АЙ",
        "AY2": "АЙ",
        "B": "Б",
        "CH": "Ч",
        "D": "Д",
        "DH": "С",
        "EH0": "Э",
        "EH1": "E",
        "EH2": "Е",
        "ER0": "ЕР",
        "ER1": "ЕР",
        "ER2": "ЕР",
        "EY0": "ЕЙ",  # конец слова - Е
        "EY1": "ЕЙ",  # конец слова - Е
        "EY2": "ЕЙ",  # конец слова - Е
        "F": "Ф",
        "G": "Г",
        "HH": "Х",
        "IH0": "И",
        "IH1": "И",
        "IH2": "И",
        "IY0": "И",
        "IY1": "И",
        "IY2": "И",
        "JH": "ДЖ",
        "K": "К",
        "L": "Л",
        "M": "М",
        "N": "Н",
        "NG": "Н",  # в конце - НГ, NGS - НГС
        "OY0": "ОЙ",
        "OY1": "ОЙ",
        "OY2": "ОЙ",
        "P": "П",
        "R": "Р",
        "S": "С",
        "SH": "Ш",
        "T": "Т",
        "TH": "С",
        "UH0": "У",
        "UH1": "У",
        "UH2": "У",
        "UW": "У",
        "UW0": "У",
        "UW1": "Ю",
        "UW2": "Ю",
        "V": "В",
        "W": "В",
        "Y": "Ь",  # В основном идет перед UW*. Меняет его на Ю
        "Z": "З",
        "ZH": "Ж"
    }

    word = re.sub(r"\d", "", word)  # :TODO add digits support
    phonemes = g2p.g2p(word)  # list of phonemes

    if phonemes[-1] == "NG":
        phonemes[-1] = "N"
        phonemes.append("G")
    if phonemes[-2:] == ["NG", "Z"] or phonemes[-2:] == ["NG", "S"]:
        phonemes[-2], phonemes[-1] = "N", "G"
        phonemes.append("S")

    if "EY" in phonemes[-1]:
        phonemes[-1] = "EH1"
    if phonemes[-1] == "Z":
        phonemes[-1] = "S"

    transcripted_phonemes = [phoneme_dict[x] if x in phoneme_dict else " " for x in phonemes]

    if phonemes[0] == "Y":
        transcripted_phonemes[0] = "Й"

    return "".join(transcripted_phonemes).strip().lower()


def handle_latin(x):
    word = x.group(0)
    word = word.strip()
    if re.search(r"[a-z]", word):
        if re.search(r"[а-я]", word) or len(word) <= 3:
            return translit(word, "ru")
        else:
            return transcript(word)
    return word


def text_sub(text_cell):
    try:
        return re.sub(r"\b([a-zа-я])*\b", handle_latin, text_cell)
    except:
        return translit(text_cell, "ru")

In [None]:
test_df = pd.read_excel(r"T:\=Tiburon_NEW\!!!Проекты\Yandex\13262 Трекинг Яндекс Деньги\DP\opens\w3\A1 A3.xlsm", sheet_name="Data")
test_df = test_df.iloc[:,2:4]
test_df.columns = ["Text", "Code"]

os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

def process_df(df):
    df = df[~pd.isnull(df.Text)]
    df.Text = df.Text.astype(str)
    df.Text = df.Text.replace(r'[^\w\d\.\,\?\! ]', ' ', regex=True)
    df.Text = df.Text.replace("\s\s+", " ")
    df.Text = df.Text.replace("ё", "е")
    df.Text = df.Text.replace(transcript_dict)
    df.Text = df.Text.str.strip().str.lower()
    
    return df

test_df = process_df(test_df).drop_duplicates("Text")
test_df.Text = test_df.Text.apply(text_sub)
test_df = process_df(test_df).drop_duplicates("Text")
test_df.head()

In [233]:
test_df = test_df[test_df.Text.str.contains("\w")]
test_df.shape

(1269, 2)

In [234]:
def text2predict(model, text, char_vectorizer):
    text2vec = char_vectorizer.transform(text)
    return model.predict(text2vec)

In [244]:
text2predict(model, ["прон"], char_vectorizer)

array([[0.38886216]], dtype=float32)

In [None]:
test_df["Predicted"] = text2predict(model, test_df.Text, char_vectorizer)

In [None]:
def check_match(i):
    code = 1 if test_df.loc[i, "Code"] == 99 else 0
    predict = 1 if test_df.loc[i, "Predicted"] >= 0.5 else 0
    return code == predict

test_df["Match"] = test_df.index.map(check_match)

In [243]:
test_df.Match.sum()/test_df.shape[0]

0.8786446020488574