In [None]:
import os
from enum import Enum
from getpass import getpass

import nltk as nltk
import pandas as pd
import seaborn as sns
from hdbcli import dbapi
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.cistem import Cistem

In [None]:
# Connection settings are read from the environment so that no credentials are
# stored in the notebook. Host, port and user fall back to the values this
# notebook has always used; the password has no fallback and is asked for
# interactively when HANA_PASSWORD is not set.
hana_host = os.environ.get('HANA_HOST', '34.255.100.176')
hana_port = int(os.environ.get('HANA_PORT', '39015'))
hana_user = os.environ.get('HANA_USER', 'SYSTEM')
hana_password = os.environ.get('HANA_PASSWORD') or getpass('HANA password: ')
connection = dbapi.connect(hana_host, hana_port, hana_user, hana_password)
# Displayed as the cell output to confirm that the connection is open.
connection.isconnected()

In [None]:
# Open a cursor on the database connection; the SQL cells below run their statements through it.
cursor = connection.cursor()

# Teil 2

In [None]:
# Load the two exported tables from CSV files in the working directory.
# NOTE(review): judging by the columns used further down (TA_TOKEN, TA_TYPE,
# TA_NORMALIZED, TA_SENTENCE) each row appears to describe one token - confirm
# against the files.
cmpl_data = pd.read_csv('./_TA_CDESCRIND__201911271735.csv')
t3n_data = pd.read_csv('./_TA_T3NTEXTIND__201912031756.csv')

In [None]:
# Preview the first rows of the loaded t3n table.
t3n_data.head()

## Aufgabe 1 - Nouns per Document

### Number of Nouns per Document (Optional)

In [None]:
# Keep only the rows tagged as nouns, count them per document ID and plot the
# distribution of those per-document counts.
t3n_nouns = t3n_data.loc[t3n_data['TA_TYPE'].eq("noun")]
nouns_per_doc = t3n_nouns["ID"].value_counts()
ax = sns.distplot(nouns_per_doc)
ax.set(xlabel='#nouns per document', title='Nouns per document')
# Keep a handle on the figure that owns the axes.
fig = ax.get_figure()

### Using Pandas

In [None]:
# Rows of the complaint table whose TA_TYPE is "noun".
cmpl_nouns = cmpl_data[cmpl_data['TA_TYPE']=="noun"]
# Occurrences of every noun token within each CMPLID.
# NOTE(review): this expression is not the last statement of the cell (a function
# definition follows), so its result is computed but not displayed.
cmpl_nouns.groupby('CMPLID')['TA_TOKEN'].value_counts()

def get_tokencount_per_document(doc_id, df, id_column='CMPLID'):
    """Count how often each noun token occurs in a single document.

    Parameters
    ----------
    doc_id
        Value identifying the document in ``id_column``.
    df : pandas.DataFrame
        Table with the columns ``TA_TYPE``, ``TA_TOKEN`` and ``id_column``.
    id_column : str, optional
        Name of the column holding the document identifier. Defaults to
        ``'CMPLID'``; pass ``'ID'`` to use the function on the t3n table.

    Returns
    -------
    pandas.Series
        ``value_counts()`` of ``TA_TOKEN`` over the rows of the document whose
        ``TA_TYPE`` is ``"noun"``; empty when no row matches.
    """
    # One combined mask instead of two successive filters.
    is_noun_of_doc = (df['TA_TYPE'] == "noun") & (df[id_column] == doc_id)
    return df.loc[is_noun_of_doc, 'TA_TOKEN'].value_counts()

### Using SQL-View

In [None]:
# Drop the COUNT_NOUNS view so that the next cell can create it again.
sql_drop_view = 'drop view COUNT_NOUNS'
try:
    cursor.execute(sql_drop_view)
except dbapi.Error as err:
    # On a fresh database the view does not exist yet; that must not abort a
    # "Restart & Run All". Error code 321 is HANA's "invalid view name";
    # every other database error is re-raised unchanged.
    if err.errorcode != 321:
        raise

In [None]:
# Create the view COUNT_NOUNS: one row per (ID, TA_TOKEN) pair of "$TA_T3NTEXTIND"
# restricted to TA_TYPE = 'noun', with COUNT holding the number of matching rows.
# NOTE(review): presumably this statement fails when the view already exists,
# which would explain the drop cell before it - confirm.
sql = 'create view COUNT_NOUNS as select ID, TA_TOKEN, count(*) as COUNT from "$TA_T3NTEXTIND" where TA_TYPE=\'noun\' group by ID, TA_TOKEN'
cursor.execute(sql)

In [None]:
# Read the complete COUNT_NOUNS view into a DataFrame and preview its first 20 rows.
cursor.execute('select * from COUNT_NOUNS')
nouns_list = cursor.fetchall()
# No column names are passed here.
# NOTE(review): the columns therefore presumably get default labels instead of
# ID / TA_TOKEN / COUNT - confirm and consider naming them.
nouns_df = pd.DataFrame(nouns_list)
nouns_df.head(20)

## Aufgabe 2 - Pandas Implementation

### Size of Lexica before cleanup

In [None]:
# Lexicon size = number of distinct values, taken once from the raw TA_TOKEN column
# and once from the TA_NORMALIZED column, for both tables before any cleanup.
cmpl_lexica_size = cmpl_data['TA_TOKEN'].nunique()
t3n_lexica_size = t3n_data['TA_TOKEN'].nunique()
cmpl_normalized_lexica_size = cmpl_data['TA_NORMALIZED'].nunique()
t3n_normalized_lexica_size = t3n_data['TA_NORMALIZED'].nunique()

### Size of Lexica without punctuation, numbers, stopwords and unique tokens

In [None]:
def remove_unneeded_token_types(data, types_to_remove=('punctuation', 'number')):
    """Return the rows of ``data`` whose ``TA_TYPE`` is not in ``types_to_remove``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with a ``TA_TYPE`` column.
    types_to_remove : str or iterable of str, optional
        ``TA_TYPE`` values to drop. Defaults to punctuation and numbers.

    Returns
    -------
    pandas.DataFrame
        The remaining rows; ``data`` itself is not modified.
    """
    # A bare string would otherwise be split into single characters by set().
    if isinstance(types_to_remove, str):
        types_to_remove = [types_to_remove]
    return data[~data['TA_TYPE'].isin(set(types_to_remove))]

def remove_stopwords(data, language):
    """Return the rows of ``data`` whose lower-cased token is not a stopword.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with a ``TA_TOKEN`` column.
    language : str
        Language name passed to ``stopwords.words`` (e.g. ``'english'``).

    Returns
    -------
    pandas.DataFrame
        A new frame containing the kept rows plus the helper column
        ``TA_TOKEN_LOW`` (lower-cased string form of ``TA_TOKEN``). The frame
        passed in is left untouched.
    """
    # The NLTK stopword corpus must be available; if it is missing, run
    # nltk.download('stopwords') once.
    stopword_set = set(stopwords.words(language))
    lowered = data['TA_TOKEN'].map(lambda token: str(token).lower())
    # assign() builds a new frame, so no column is written into the caller's
    # DataFrame (which is usually a filtered slice of the raw table).
    with_lowercase = data.assign(TA_TOKEN_LOW=lowered)
    return with_lowercase[~with_lowercase['TA_TOKEN_LOW'].isin(stopword_set)]

def remove_unique_tokens(data):
    """Keep only the rows whose ``TA_TOKEN`` value occurs more than once in ``data``.

    Rows are returned in their original order with their original index; rows
    with a missing ``TA_TOKEN`` are not part of the result.
    """
    tokens = data['TA_TOKEN']
    # duplicated(keep=False) marks every member of a repeated value; missing
    # tokens are excluded explicitly because they never form a group.
    occurs_repeatedly = tokens.duplicated(keep=False) & tokens.notna()
    return data[occurs_repeatedly]

In [None]:
# Clean the complaint table in three steps and print the number of distinct
# TA_TOKEN values that remain after each step.
# Step 1: drop rows typed as punctuation or number.
cmpl_data_cleaned = remove_unneeded_token_types(cmpl_data)
print("size of cmpl lexica after punctuation and number removal: " + str(cmpl_data_cleaned['TA_TOKEN'].nunique()))
# Step 2: drop English stopwords (compared on the lower-cased token).
cmpl_data_cleaned = remove_stopwords(cmpl_data_cleaned, 'english')
print("size of cmpl lexica after stopword removal: " + str(cmpl_data_cleaned['TA_TOKEN'].nunique()))
# Step 3: drop tokens that occur only once in the remaining data.
cmpl_data_cleaned = remove_unique_tokens(cmpl_data_cleaned)
print("size of cmpl lexica after removing unique words: " + str(cmpl_data_cleaned['TA_TOKEN'].nunique()))

In [None]:
# Same three cleaning steps for the t3n table, this time with the German
# stopword list; the number of distinct TA_TOKEN values is printed after each step.
# Step 1: drop rows typed as punctuation or number.
t3n_data_cleaned = remove_unneeded_token_types(t3n_data)
print("size of t3n lexica after punctuation and number removal: " + str(t3n_data_cleaned['TA_TOKEN'].nunique()))
# Step 2: drop German stopwords (compared on the lower-cased token).
t3n_data_cleaned = remove_stopwords(t3n_data_cleaned, 'german')
print("removing german stopwords. #Tokens left: " + str(t3n_data_cleaned['TA_TOKEN'].nunique()))
# Step 3: drop tokens that occur only once in the remaining data.
t3n_data_cleaned = remove_unique_tokens(t3n_data_cleaned)
print("removing unique tokens. #Tokens left: " + str(t3n_data_cleaned['TA_TOKEN'].nunique()))

### Size of Lexica after Stemming / Lemmatizing

In [None]:
stemmer = Cistem() # German language stemmer
# Stem the string form of every TA_NORMALIZED value and store it as TA_STEMMED.
t3n_data_cleaned['TA_STEMMED'] = t3n_data_cleaned['TA_NORMALIZED'].map(lambda token: stemmer.stem(str(token)))
# NOTE(review): the "before" figure counts distinct TA_TOKEN values while the stems
# are derived from TA_NORMALIZED, so the two numbers are based on different columns.
print("t3n lexica size before stemming: " + str(t3n_data_cleaned['TA_TOKEN'].nunique()))
print("t3n lexica size after stemming: " + str(t3n_data_cleaned['TA_STEMMED'].nunique())) 

In [None]:
# The English complaint tokens are lemmatized with WordNet (not stemmed), although
# the result column and the printed labels reuse the word "stemming/STEMMED".
lemmatizer = WordNetLemmatizer()
# Fetch the WordNet data used by the lemmatizer.
nltk.download('wordnet')
# Lemmatize the string form of every TA_NORMALIZED value and store it as TA_STEMMED.
cmpl_data_cleaned['TA_STEMMED'] = cmpl_data_cleaned['TA_NORMALIZED'].map(lambda token: lemmatizer.lemmatize(str(token)))
# NOTE(review): the "before" figure counts distinct TA_TOKEN values while the lemmas
# are derived from TA_NORMALIZED, so the two numbers are based on different columns.
print("cmpl lexica size before stemming: " + str(cmpl_data_cleaned['TA_TOKEN'].nunique()))
print("cmpl lexica size after stemming: " + str(cmpl_data_cleaned['TA_STEMMED'].nunique())) 

### Mean Document / Sentence length

In [None]:
# Mean document length: average number of rows per CMPLID in the uncleaned table.
mean_cmpl_doc_length = cmpl_data["CMPLID"].value_counts().mean()
# Mean sentence length: average number of rows per (CMPLID, TA_SENTENCE) pair.
# NOTE(review): both figures are in rows of the raw table, so punctuation and
# number rows are included - confirm that this is intended.
mean_cmpl_sentence_length = cmpl_data.groupby("CMPLID")["TA_SENTENCE"].value_counts().mean()
print("mean cmpl doc length: " + str(mean_cmpl_doc_length))
print("mean cmpl sentence length: " + str(mean_cmpl_sentence_length))

In [None]:
# Mean document length: average number of rows per ID in the uncleaned table.
mean_t3n_doc_length = t3n_data["ID"].value_counts().mean()
# Mean sentence length: average number of rows per (ID, TA_SENTENCE) pair.
# NOTE(review): both figures are in rows of the raw table, so punctuation and
# number rows are included - confirm that this is intended.
mean_t3n_sentence_length = t3n_data.groupby("ID")["TA_SENTENCE"].value_counts().mean()
print("mean t3n doc length: " + str(mean_t3n_doc_length))
print("mean t3n sentence length: " + str(mean_t3n_sentence_length))

## Aufgabe 3

In [None]:
# FREQUENCY = how many rows of the cleaned t3n table share the row's TA_TOKEN value
# (the value_counts result is used as a lookup table for map).
t3n_data_cleaned['FREQUENCY'] = t3n_data_cleaned['TA_TOKEN'].map(t3n_data_cleaned['TA_TOKEN'].value_counts())

In [None]:
# frequent words
# Wide figure so that 20 token labels fit on the x axis.
sns.set(rc={'figure.figsize':(27,7)})
# One row per distinct token, then the 20 rows with the highest FREQUENCY.
most_occuring_tokens = t3n_data_cleaned.drop_duplicates(subset='TA_TOKEN').nlargest(20, 'FREQUENCY')
sns.barplot(x="TA_TOKEN", y="FREQUENCY", data=most_occuring_tokens)

In [None]:
# frequent verbs
# Wide figure so that 20 token labels fit on the x axis.
sns.set(rc={'figure.figsize':(27,7)})
# Restrict to rows typed as verb, keep one row per distinct token, then take the
# 20 rows with the highest FREQUENCY.
# NOTE(review): FREQUENCY was computed per TA_TOKEN over all types, so a token that
# is also tagged with other types is plotted with its overall count - confirm.
most_occuring_tokens = t3n_data_cleaned[t3n_data_cleaned['TA_TYPE'] == 'verb'].drop_duplicates(subset='TA_TOKEN').nlargest(20, 'FREQUENCY')
sns.barplot(x="TA_TOKEN", y="FREQUENCY", data=most_occuring_tokens)

## Aufgabe 4

In [None]:
# Preview the first rows of the cleaned t3n table.
t3n_data_cleaned.head()

In [None]:
def get_ambigious_tokens(df, id, id_column='ID'):
    """List the tokens of one document that occur with more than one ``TA_TYPE``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the columns ``TA_TOKEN``, ``TA_TYPE`` and ``id_column``.
    id
        Value identifying the document in ``id_column``.
    id_column : str, optional
        Name of the column holding the document identifier. Defaults to
        ``'ID'``; pass ``'CMPLID'`` to use the function on the complaint table.

    Returns
    -------
    pandas.DataFrame
        Columns ``TA_TOKEN``, ``TA_TYPE`` and ``FREQ``: one row per
        (token, type) combination of every token that has at least two
        different types in the document, with ``FREQ`` counting the rows of
        that combination.
    """
    doc_rows = df[df[id_column] == id]
    type_counts = doc_rows.groupby(['TA_TOKEN', 'TA_TYPE']).size().reset_index(name='FREQ')
    # Every (token, type) pair is a single row now, so a token that appears in
    # more than one row has more than one type.
    has_several_types = type_counts['TA_TOKEN'].duplicated(keep=False)
    return type_counts.loc[has_several_types, ['TA_TOKEN', 'TA_TYPE', 'FREQ']]

In [None]:
# Example: tokens with more than one TA_TYPE in the document with this ID.
get_ambigious_tokens(t3n_data_cleaned, '480af0e8-de34-4895-90df-337c38a60815')

# Tokenization (obsolete)

In [None]:
# Fetch the NLTK 'punkt' resource.
# NOTE(review): presumably needed by nltk.word_tokenize in the next cell - confirm.
nltk.download('punkt')

In [None]:
# NOTE(review): this is a plain assignment, not a copy - t3n_data_tokenized and
# t3n_data refer to the same DataFrame, so the column assignments below also
# change t3n_data.
t3n_data_tokenized = t3n_data
# Replace the text, teaser and heading columns by the NLTK word tokens of the
# string form of each value.
# NOTE(review): running this cell a second time would tokenize the string form of
# the token lists produced by the first run.
t3n_data_tokenized['text'] = t3n_data_tokenized.apply(lambda row: nltk.word_tokenize(str(row['text'])), axis=1)
t3n_data_tokenized['teaser'] = t3n_data_tokenized.apply(lambda row: nltk.word_tokenize(str(row['teaser'])), axis=1)
t3n_data_tokenized['heading'] = t3n_data_tokenized.apply(lambda row: nltk.word_tokenize(str(row['heading'])), axis=1)

In [None]:
# Print the 'heading' value of every row of t3n_data, one per line.
for heading in t3n_data['heading']:
    print(heading)