In [4]:
%matplotlib inline

import json
import csv
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import chi2
import numpy as np
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.manifold import MDS, TSNE, LocallyLinearEmbedding, Isomap
from MulticoreTSNE import MulticoreTSNE
from sklearn.decomposition import TruncatedSVD, PCA, KernelPCA, SparsePCA
import umap
import pandas as pd
from gb_writer import GlyphboardWriter
# from sklearn.model_selection import GridSearchCV
from typing import Any
import keras
from keras.models import Sequential, Model
from keras.layers import Dense
from keras.optimizers import Adam
import spacy
from spacy.lang.de.stop_words import STOP_WORDS


# Load the German spaCy pipeline once at module level; preprocessText() reuses it.
# NOTE(review): 'de' looks like a spaCy 2.x shortcut name — newer spaCy releases
# presumably need a full package name (e.g. 'de_core_news_sm'). Confirm against
# the installed spaCy version.
nlp = spacy.load('de')

In [40]:
# Classifiers
# Candidate estimators that can be handed to train() / createMetrics() as `algo`.
# SGD uses hinge loss with an L2 penalty, runs a fixed 5 iterations
# (tol=None, so no tolerance-based stopping) and is seeded for repeatability.
SGD = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3,
                    random_state=42, max_iter=5, tol=None)
MNB = MultinomialNB()
LR = LogisticRegression()
SVC = LinearSVC()

# Shared TF-IDF vectorizer; handleNewAnswer() refits it on the full text column.
vec = TfidfVectorizer()
# initData() writes the rows at positions SPLICE_POINT+1 and later to test_data.csv.
SPLICE_POINT = 800

def initData():
    """Create ``test_data.csv`` and ``data.csv`` from the raw ``test_data.json`` export.

    Every document in the JSON file contributes one row with the columns

    * ``id``         -- ``doc['id']``
    * ``text``       -- ``doc['values']['7']``
    * ``peer_label`` -- ``doc['features']['1']['4']``
    * ``label``      -- 1 when ``peer_label > 0.5``, otherwise 0
    * ``score``      -- ``1 - peer_label``

    Side effects:
        * ``test_data.csv`` receives the rows at positions ``SPLICE_POINT + 1``
          and later, including their derived 0/1 labels.
        * ``data.csv`` receives all rows with ``label`` reset to the
          placeholder 0.5 (meaning "not labelled yet").

    Raises:
        FileNotFoundError: if ``test_data.json`` does not exist.
        KeyError: if a document lacks one of the keys listed above.
    """
    # Explicit UTF-8 so the result does not depend on the platform locale,
    # consistent with every other file access in this notebook.
    with open("test_data.json", "r", encoding="utf8") as read_file:
        raw_docs = json.load(read_file)

    records = []
    for doc in raw_docs:
        # Look the peer label up once; it drives three of the five columns.
        peer_label = doc["features"]["1"]["4"]
        records.append({
            'id': doc['id'],
            'text': doc["values"]["7"],
            'label': 1 if peer_label > 0.5 else 0,
            'score': 1 - peer_label,
            'peer_label': peer_label,
        })

    # Passing `columns` keeps the schema stable even for an empty export.
    df = pd.DataFrame(
        records, columns=['id', 'text', 'label', 'score', 'peer_label'])

    # Held-out rows keep their real labels and are written before the reset.
    test_data = df[SPLICE_POINT+1:]
    test_data.to_csv('test_data.csv', sep=";", encoding="utf8", index=False)

    # Replace the whole column instead of writing 0.5 into the int column via
    # `.loc[:, 'label']`, which is a deprecated dtype-incompatible assignment
    # in recent pandas versions.
    df['label'] = 0.5
    df.to_csv('data.csv', sep=";", encoding="utf8", index=False)



def loadData():
    """Read ``data.csv`` (';'-separated, UTF-8) from the working directory.

    Returns:
        pandas.DataFrame: the parsed file contents.
    """
    frame = pd.read_csv('data.csv', encoding="utf8", sep=";")
    return frame

def handleNewAnswer(answer):
    """Store a newly submitted label, then recompute positions and metrics.

    Args:
        answer: mapping with at least ``'documentId'`` (id of the labelled
            document) and ``'answer'`` (the label, convertible with ``int``).

    Returns:
        str: a JSON document with the keys ``'positions'`` (result of
        ``applyDR``) and ``'train_result'`` (result of ``train`` using the
        ``MNB`` classifier) once more than three documents are labelled;
        the empty string otherwise.

    Side effects:
        Rewrites ``data.csv`` (via ``updateDataWithLabel``), refits the shared
        ``vec`` vectorizer and appends to ``metrics.csv`` (via ``train``).
    """
    doc_id = answer['documentId']
    label = int(answer['answer'])

    # Persist the label BEFORE deriving the training set. Loading the training
    # rows first would leave the answer being handled out of this training run
    # and evaluate the size threshold on stale data.
    data = updateDataWithLabel(doc_id, label)
    train_data = data.loc[data['label'] != 0.5]

    if len(train_data) <= 3:
        return ''

    test_data = pd.read_csv(
        'test_data.csv', delimiter=';', encoding="utf8")

    tfidf = vec.fit_transform(data.text)
    positions = applyDR(tfidf, data.label)
    train_result = train(train_data, test_data, MNB)
    return json.dumps({
        'positions': positions,
        'train_result': train_result
    })

def updateDataWithLabel(docId, label):
    """Set the label of every row whose ``id`` equals ``docId`` and save data.csv.

    The matching rows are printed before and after the change.

    Args:
        docId: value compared against the ``id`` column.
        label: new label; converted with ``int`` before being stored.

    Returns:
        pandas.DataFrame: the full, updated frame that was written to disk.
    """
    data = loadData()
    matches = data['id'] == docId

    print('before', data.loc[matches])
    data.loc[matches, 'label'] = int(label)
    print('after', data.loc[matches])

    data.to_csv('data.csv', index=False, encoding="utf8", sep=";")
    return data


def createMetrics(algo):
    """Replay ``training_data.csv`` in growing prefixes to simulate a metric history.

    For every prefix length from 30 up to (but excluding) the number of rows in
    ``training_data.csv`` the classifier is retrained via ``train`` and its
    result dict collected.

    NOTE(review): the evaluation frame comes from ``loadData()`` (data.csv),
    not from test_data.csv — confirm that this is intended.

    Args:
        algo: estimator forwarded to ``train``.

    Returns:
        pandas.DataFrame: one row per simulated step.
    """
    evaluation_set = loadData()

    history_df = pd.read_csv("training_data.csv", sep=";")
    history_df.label = history_df.label.astype(int)

    # Create stepwise metrics, simulating a history
    steps = [
        train(history_df.head(size), evaluation_set, algo=algo)
        for size in range(30, len(history_df))
    ]
    return pd.DataFrame(steps)


def train(train_data, test_data, algo: Any) -> dict:
    """Fit a bag-of-words -> TF-IDF -> ``algo`` pipeline and score it.

    Args:
        train_data: object exposing ``.text`` and ``.label`` used for fitting.
        test_data: object exposing ``.text`` and ``.label`` used for scoring.
        algo: estimator placed as the final ``'clf'`` pipeline step.

    Returns:
        dict with ``'precision'``, ``'recall'`` and ``'f1'`` for the predictions
        on ``test_data`` plus ``'f1_history'``, the full content returned by
        ``getHistory()`` after this run's F1 has been appended.

    Side effects:
        Appends the new F1 score to the history via ``addHistory``.
    """
    steps = [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', algo),
    ]
    text_clf = Pipeline(steps)
    text_clf.fit(train_data.text, train_data.label)

    expected = test_data.label
    predicted = text_clf.predict(test_data.text)

    # Record the F1 score first so that it is part of the returned history.
    f1 = metrics.f1_score(expected, predicted)
    addHistory(f1)

    return {
        'precision': metrics.precision_score(expected, predicted),
        'recall': metrics.recall_score(expected, predicted),
        'f1': f1,
        'f1_history': getHistory()
    }

def getTrainData():
    """Return the rows of data.csv whose label differs from the 0.5 placeholder."""
    data = loadData()
    is_labelled = data['label'] != 0.5
    return data.loc[is_labelled]

def resetTrainData():
    """Set every label in data.csv to the placeholder 0.5 and rewrite the file."""
    frame = loadData()
    frame.loc[:, 'label'] = 0.5
    frame.to_csv('data.csv', index=False, encoding="utf8", sep=";")

def cleanupTexts():
    """Run ``preprocessText`` over every text in data.csv.

    The cleaned texts used to be computed and then thrown away, so the
    function had no observable effect. They are now returned to the caller.
    data.csv itself is not modified.

    Returns:
        list[str]: the preprocessed texts, in file order.
    """
    data = loadData()
    return [preprocessText(text) for text in data.text]

def mockTraining(amount):
    """Simulate labelling by copying thresholded peer labels into ``label``.

    The first ``amount`` rows of data.csv get label 1 when their
    ``peer_label`` is greater than 0.5 and label 0 otherwise; the file is then
    rewritten.

    Args:
        amount: number of leading rows to label. Values below zero are treated
            as 0 and values beyond the number of rows are capped, instead of
            failing with a KeyError on the first missing row.
    """
    data = loadData()

    # Clamp so that neither a negative slice end nor an overshoot can occur.
    count = max(0, min(amount, len(data)))
    rows = data.index[:count]

    # One vectorised assignment instead of a per-row .loc read/write loop.
    data.loc[rows, 'label'] = (data.loc[rows, 'peer_label'] > 0.5).astype(int)

    data.to_csv('data.csv', sep=";", encoding="utf8", index=False)

def getHistory():
    """Return the first column of every non-empty row in ``metrics.csv``.

    Blank lines in the file are skipped; previously they raised IndexError
    because ``csv.reader`` yields an empty list for them.

    Returns:
        list[str]: the stored values as strings, oldest first.

    Raises:
        FileNotFoundError: if ``metrics.csv`` does not exist yet.
    """
    # newline="" lets the csv module handle line endings itself; the `with`
    # block closes the file, so no explicit close() is needed.
    with open("metrics.csv", "r", encoding="utf8", newline="") as file:
        reader = csv.reader(file, delimiter=';')
        return [row[0] for row in reader if row]


def addHistory(metrics):
    """Append ``str(metrics)`` as a single-column row to ``metrics.csv``.

    Args:
        metrics: value to record; it is stringified before being written.
    """
    row = [str(metrics)]
    with open("metrics.csv", "a", newline="", encoding="utf8") as handle:
        csv.writer(handle, delimiter=';').writerow(row)

def getCurrentScore() -> str:
    """Return the most recently recorded history entry.

    The value is returned exactly as ``getHistory()`` provides it, i.e. as a
    string. The previous ``-> int`` annotation did not match that.

    Raises:
        IndexError: if the history is empty.
    """
    history = getHistory()
    return history[-1]

def applyDR(tfidf, labels=None, label_impact=0.6):
    """Project documents to 2-D with UMAP and write the positions out.

    Args:
        tfidf: sparse document-term matrix (anything providing ``toarray()``).
        labels: optional sequence with one label per document. When given, it
            is scaled by ``label_impact`` and appended as an extra feature
            column. ``None`` or an empty sequence means "no label column".
            The former ``[]`` default could not be stacked onto a non-empty
            matrix and made the default call fail.
        label_impact: weight applied to the label column (default 0.6, the
            value that used to be hard-coded).

    Returns:
        Whatever ``GlyphboardWriter.write_position`` returns for the computed
        ``x``/``y`` frame.
    """
    features = tfidf.toarray()

    if labels is not None and len(labels) > 0:
        labels_arr = np.asarray(labels) * label_impact
        labels_arr = labels_arr.reshape(len(labels_arr), 1)
        features = np.hstack((features, labels_arr))

    # Fixed random_state keeps the layout reproducible between calls.
    computed_coords = umap.UMAP(min_dist=0.8, random_state=1).fit_transform(features)
    df = pd.DataFrame({
        'x': computed_coords[:, 0],
        'y': computed_coords[:, 1],
    })

    writer = GlyphboardWriter('test_name')

    print('Writing positions...')
    positions = writer.write_position(df, 'lsi')
    return positions

def preprocessText(text: str) -> str:
    """Drop stop words from ``text`` using the module-level spaCy pipeline.

    The remaining tokens keep their original surface form (``token.text``; no
    lemmatisation is applied) and are joined with single spaces.
    """
    kept_tokens = []
    for token in nlp(text):
        if token.is_stop:
            continue
        kept_tokens.append(token.text)
    return ' '.join(kept_tokens)

In [8]:
# NOTE(review): getTestData() is not defined in any cell visible in this
# notebook export, so this cell presumably relies on leftover kernel state
# and would raise NameError after Restart & Run All — confirm and replace.
data = getTestData()
# Rebinds the module-level `vec` to a fresh vectorizer and fits it on all texts.
vec = TfidfVectorizer()
tfidf = vec.fit_transform(data.text)

In [9]:
# Exploratory cell: append the label as an extra feature column to the dense
# TF-IDF matrix, project to 2-D with UMAP and write the positions.
# The commented-out lines are alternative reducers that were tried.
# lsi = TruncatedSVD(n_components=2, random_state=1).fit_transform(tfidf.toarray())
# lsi = PCA(n_components=10, random_state=1).fit_transform(tfidf.toarray())

# Turn the label column into an (n, 1) array so it can be stacked as a feature.
labels = np.asarray(data.label)
labels = labels.reshape(len(labels), 1)
# with_labels = np.hstack((lsi,labels))
with_labels = np.hstack((tfidf.toarray(),labels))


# Disabled autoencoder experiment (2-unit bottleneck used as the embedding).
# m = Sequential()
# m.add(Dense(512,  activation='elu', input_shape=(101,)))
# m.add(Dense(128,  activation='elu'))
# m.add(Dense(2,    activation='linear', name="bottleneck"))
# m.add(Dense(128,  activation='elu'))
# m.add(Dense(512,  activation='elu'))
# m.add(Dense(101,  activation='sigmoid'))
# m.compile(loss='mean_squared_error', optimizer = Adam())
# history = m.fit(with_labels, with_labels, batch_size=128, epochs=5, verbose=1)

# encoder = Model(m.input, m.get_layer('bottleneck').output)
# lsi_with_labels = encoder.predict(with_labels)

# lsi_with_labels = PCA(n_components=2, random_state=1).fit_transform(with_labels)
# lsi_with_labels = TruncatedSVD(n_components=2, random_state=1).fit_transform(with_labels)
# lsi_with_labels = Isomap(n_components=2).fit_transform(with_labels)
# lsi_with_labels = MulticoreTSNE(n_jobs=1, n_components=2, random_state=1).fit_transform(with_labels)
# lsi_with_labels = LocallyLinearEmbedding(n_jobs=4,n_components=2, random_state=1).fit_transform(with_labels)
# Active reducer: UMAP with a fixed seed. Unlike applyDR(), the label column
# is not scaled here and min_dist is left at UMAP's default.
lsi_with_labels = umap.UMAP(random_state=1).fit_transform(with_labels)
df = pd.DataFrame(columns=['x', 'y'])
df['x'] = lsi_with_labels[:, 0]
df['y'] = lsi_with_labels[:, 1]
    
    
# Scaling hook; a factor of 1 leaves the coordinates unchanged.
df *= 1
# DR.x *= 500
# DR.y *= 500

# print(df)

writer = GlyphboardWriter('test_name')

# DR *= 2
print('Writing positions...')    
writer.write_position(df, 'lsi')
# Drop the coordinate frame from the kernel namespace once it has been written.
del df
print('done')

Writing positions...
done


In [10]:
# Rebuild the classifier and split `data` into an unlabelled pool and a
# held-out test set. The split uses the shared SPLICE_POINT constant instead
# of a repeated literal 800, so this cell cannot drift from initData().
MNB = MultinomialNB()
train_data = pd.read_csv(
        'gb_training.csv', delimiter=';', encoding="utf8")
# train(train_data, data[201:], MNB)
# Rows up to and including SPLICE_POINT lose their label (0.5 = "unlabelled");
# note that this mutates `data` in place.
data.loc[:SPLICE_POINT, 'label'] = 0.5
# Everything after SPLICE_POINT keeps its label and serves as test data.
test_data = data[SPLICE_POINT + 1:]
test_data

Unnamed: 0,label,text
801,0.0,"Messe the wedding showroom ""Kissed by the sun"""
802,1.0,"Sommerakademie 2014 ""Mystik und Alltag im int..."
803,0.0,Die Gans für 4 Personen zum Martinsfest Eine k...
804,1.0,Weiberfasching mit GraceComany Weiberfasching ...
805,0.0,Gästeführer Gartenerlebnis Bayern in Bayreuth ...
806,1.0,Jubiläumskonzerte im Schloss - 10 Jahre Berlin...
807,0.0,Seit 1992 wird das Mittelalterfest auf Schloss...
808,1.0,Eisdisko Eisdisko auf dem Konzertplatz. Da wer...
809,1.0,Wohnzimmermusik - Andrew Medwed Andrew Medwed ...
810,0.0,Die Verwandlung Eine Tanzproduktion nach der g...


In [19]:
# Reload the working data and show only the rows that already carry a real
# label (anything other than the 0.5 placeholder).
data = loadData()
data[data['label'].ne(0.5)]

Unnamed: 0,id,label,peer_label,score,text
0,1,0.0,0.218750,0.781250,Silvester 2.014 AAAAAAlllttterrr Schwedeee.......
1,2,1.0,1.000000,0.000000,Live in Concert: Trabant Echo TRABANT ECHO / H...
2,3,1.0,1.000000,0.000000,Jazz im Sein. mit dem Jan-Phillip Meyer Quinte...
3,4,1.0,0.968750,0.031250,BUNKER BDAY ATTACK @ BERLIN ☭ ☭ ☭ ☞ AM 18.12 H...
4,5,1.0,1.000000,0.000000,THE TUNES live im Lehmitz - Mi. 28.05.14 Aktue...
5,6,0.0,0.062500,0.937500,"Kiezmenü ""Kakao gegen den Novemberblues"" am Mi..."
6,7,0.0,0.000000,1.000000,Drachenfest die besten drei Drachen werden prä...
7,8,0.0,0.000000,1.000000,"Annett Gröschner liest ""Magdeburger Geschichte..."
8,9,0.0,0.000000,1.000000,"""Sülverhochtied"""
9,10,0.0,0.000000,1.000000,Advents-Buffet Zusätzlich zu unserer Abendkart...


In [47]:
# Reload data.csv so the following cells start from the persisted state.
data = loadData()

In [53]:
# Replace every text with its preprocessText() result and persist the frame.
# This overwrites the raw texts in data.csv.
data['text'] = [preprocessText(text) for text in data.text]

data.to_csv('data.csv', index=False, encoding="utf8", sep=";")

In [54]:
# Display the frame to inspect the texts after preprocessing.
data

Unnamed: 0,id,label,peer_label,score,text
0,1,0.0,0.218750,0.781250,Silvester 2.014 AAAAAAlllttterrr Schwedeee ......
1,2,1.0,1.000000,0.000000,Live Concert : Trabant Echo TRABANT ECHO / HYP...
2,3,1.0,1.000000,0.000000,Jazz Sein . Jan-Phillip Meyer Quintett Die Ban...
3,4,1.0,0.968750,0.031250,BUNKER BDAY ATTACK @ BERLIN ☭ ☭ ☭ ☞ AM 18.12 H...
4,5,1.0,1.000000,0.000000,THE TUNES live Lehmitz - Mi. 28.05.14 Aktuelle...
5,6,0.0,0.062500,0.937500,"Kiezmenü "" Kakao Novemberblues "" Mittwoch Nove..."
6,7,0.0,0.000000,1.000000,Drachenfest Drachen prämiert .
7,8,0.0,0.000000,1.000000,"Annett Gröschner liest "" Magdeburger Geschicht..."
8,9,0.0,0.000000,1.000000,""" Sülverhochtied """
9,10,0.0,0.000000,1.000000,Advents-Buffet Zusätzlich Abendkarte bieten do...
