In [None]:
import os
from getpass import getpass

import pandas as pd
import seaborn as sns
from hdbcli import dbapi

In [None]:
# Read the connection settings from the environment so that no credentials
# are stored in (or committed with) the notebook; the password is prompted
# for when HANA_PASSWORD is not set.
# NOTE: the password that used to be hard-coded here should be rotated.
hana_host = os.environ.get('HANA_HOST', '18.202.247.58')
hana_port = int(os.environ.get('HANA_PORT', '39015'))
hana_user = os.environ.get('HANA_USER', 'SYSTEM')
hana_password = os.environ.get('HANA_PASSWORD') or getpass('HANA password: ')
connection = dbapi.connect(hana_host, hana_port, hana_user, hana_password)
connection.isconnected()

In [None]:
# Cursor on the open connection; the query cells below all execute through it.
cursor = connection.cursor()

# Teil 2 mit SQL
## Aufgaben 1
### SQL View

In [None]:
# drop SQL-View left over from an earlier run so the create below can be re-run
sql_drop_view = 'drop view COUNT_NOUNS_NHTSA'
try:
    cursor.execute(sql_drop_view)
except dbapi.Error as err:
    # 321 = "invalid view name": there is nothing to drop on a fresh database.
    # Any other database error is a real problem and must stay visible.
    if err.errorcode != 321:
        raise

In [None]:
# View with one row per (CMPLID, noun token): how often that noun occurs for
# that CMPLID in the text-analysis index "$TA_CDESCRIND".
sql = 'create view COUNT_NOUNS_NHTSA as select CMPLID, TA_TOKEN, count(*) as COUNT from "$TA_CDESCRIND" where TA_TYPE=\'noun\' group by CMPLID, TA_TOKEN'
cursor.execute(sql)

### Nouns per document

In [None]:
def plot_nouns(nouns_df, out_path='plots/nouns_per_document_NHTSA.png'):
    """Plot the distribution of per-document noun counts and save it as an image.

    Parameters
    ----------
    nouns_df : pandas.DataFrame
        Frame with a ``count`` column (one noun count per document).
    out_path : str, optional
        File the figure is written to. Defaults to the path this notebook has
        always used. Missing parent directories are created.
    """
    # NOTE(review): distplot is deprecated in newer seaborn releases
    # (histplot/displot replace it); kept so the plot looks the same.
    ax = sns.distplot(nouns_df['count'])
    ax.set(xlabel='#nouns per document')
    ax.set_title('Nouns per document')
    out_dir = os.path.dirname(out_path)
    if out_dir:
        # savefig does not create directories and would fail on a fresh checkout
        os.makedirs(out_dir, exist_ok=True)
    fig = ax.get_figure()
    fig.savefig(out_path)

#### Using SQL-View

In [None]:
# nouns for all documents: sum the per-noun counts of the view for every
# CMPLID, largest totals first
cursor.execute('select CMPLID, sum(COUNT) from COUNT_NOUNS_NHTSA group by CMPLID order by sum(COUNT) desc')
nouns_list = cursor.fetchall()
nouns_df = pd.DataFrame(nouns_list)
# name the two result columns: CMPLID -> 'doc', summed noun count -> 'count'
nouns_df.columns = ['doc', 'count']
nouns_df.head()

In [None]:
# Plot the distribution of the 'count' column and save the figure to disk.
plot_nouns(nouns_df)

In [None]:
# Rows of the noun view (CMPLID, noun, count). The query has no WHERE clause,
# so it is not restricted to a specific document.
# NOTE: this overwrites nouns_df from the cells above; no column names are
# assigned here, so the frame keeps pandas' default labels.
cursor.execute('select * from COUNT_NOUNS_NHTSA')
nouns_list = cursor.fetchall()
nouns_df = pd.DataFrame(nouns_list)
nouns_df.head(20)

## Aufgaben 2
### Size of Lexica

In [None]:
# Lexicon size: number of distinct TA_TOKEN values in the text-analysis index.
# No TA_TYPE filter is applied here, so every token type is counted.
cursor.execute('select count(distinct TA_TOKEN) from "$TA_CDESCRIND"')
nouns_list = cursor.fetchall()
lexica_size = pd.DataFrame(nouns_list)
lexica_size.columns = ['size']
lexica_size

### Size of Lexica without punctuation and stopwords

In [None]:
import nltk as nltk
from nltk.corpus import stopwords

def remove_stopwords(data, stopword_list=None):
    """Return the lower-cased tokens of ``data`` that are not stop words.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``TOKEN`` column. It is not modified.
    stopword_list : iterable of str, optional
        Lower-case stop words to remove. When omitted, the NLTK English
        stop-word list is downloaded (if necessary) and used.

    Returns
    -------
    pandas.DataFrame
        Single column ``TOKEN`` with the remaining lower-cased tokens in their
        original order. Tokens are not de-duplicated. The frame is empty, but
        still has the ``TOKEN`` column, when every token is a stop word.
    """
    if stopword_list is None:
        nltk.download('stopwords')
        stopword_list = stopwords.words('english')
    stopword_set = set(stopword_list)
    # str() keeps the original treatment of non-string values (e.g. None -> 'none')
    tokens_low = data['TOKEN'].map(lambda token: str(token).lower())
    kept = tokens_low[~tokens_low.isin(stopword_set)]
    # Building from a dict keeps the column name even when nothing is left
    return pd.DataFrame({'TOKEN': kept.tolist()})

In [None]:
# Distinct tokens, leaving out those whose TA_TYPE is 'punctuation' or 'number'.
cursor.execute('select distinct TA_TOKEN from "$TA_CDESCRIND" where TA_TYPE <> \'punctuation\' and TA_TYPE <> \'number\'')
lexica_list = cursor.fetchall()
lexica_df = pd.DataFrame(lexica_list)
lexica_df.columns = ['TOKEN']
# the frame has a single column, so .size equals the number of rows
lexica_df.size

In [None]:
# Remove stop words as well; the result has one column, so .size is its row count.
# NOTE(review): remove_stopwords lower-cases the tokens but does not
# de-duplicate them, so case variants of a word are counted separately.
filtered_lexica = remove_stopwords(lexica_df)
filtered_lexica.size

### Mean length of a document

In [None]:
# nouns per document
sql = 'create view COUNT_TOKEN_NHTSA as select CMPLID, count(*) as COUNT from "$TA_CDESCRIND" group by CMPLID order by count(*) desc'
cursor.execute(sql)

In [None]:
# drop SQL-View
sql_drop_view = 'drop view COUNT_TOKEN_NHTSA'
cursor.execute(sql_drop_view)

In [None]:
cursor.execute('select avg(COUNT) from COUNT_TOKEN_NHTSA')
mean_length_doc = cursor.fetchall()
mean_length_doc = pd.DataFrame(mean_length_doc)
mean_length_doc.columns = ['AVERAGE']
mean_length_doc

### Mean length of a sentence

In [None]:
# In the SAP HANA text-analysis ($TA) table, TA_SENTENCE is the number of the
# sentence a token belongs to, so averaging that column does not yield a
# length. Count the tokens of every (document, sentence) pair first and
# average those counts instead.
cursor.execute('select avg(TOKEN_COUNT) as AVG_SENTENCE from (select CMPLID, TA_SENTENCE, count(*) as TOKEN_COUNT from "$TA_CDESCRIND" group by CMPLID, TA_SENTENCE) as SENTENCES')
mean_length_sentence = cursor.fetchall()
mean_length_sentence_df = pd.DataFrame(mean_length_sentence)
# the aggregate is NULL (NaN here) when the index is empty
mean_length_sentence_df_new = mean_length_sentence_df.dropna()
mean_length_sentence_df_new.columns = ['AVERAGE']
mean_length_sentence_df_new.head()

## Aufgaben 3
### Word frequency

In [None]:
# Frequency of every token in the text-analysis index, most frequent first.
cursor.execute('select TA_TOKEN, count(*) from "$TA_CDESCRIND" group by TA_TOKEN order by count(*) desc')
words_list = cursor.fetchall()
words_df = pd.DataFrame(words_list)
words_df.columns = ['TOKEN', 'count']
words_df

In [None]:
# Same frequency table, leaving out tokens typed as 'punctuation' or 'number'.
# NOTE: this overwrites lexica_df from the lexicon-size section with a
# two-column (TOKEN, count) frame.
cursor.execute('select TA_TOKEN, count(*) from "$TA_CDESCRIND" where TA_TYPE <> \'punctuation\' and TA_TYPE <> \'number\' group by TA_TOKEN order by count(*) desc')
lexica_list = cursor.fetchall()
lexica_df = pd.DataFrame(lexica_list)
lexica_df.columns = ['TOKEN', 'count']
lexica_df

### Frequent and rare words

In [None]:
# frequent words: token frequencies restricted to nouns, verbs and adjectives,
# most frequent first
cursor.execute('select TA_TOKEN, count(*) from "$TA_CDESCRIND" where TA_TYPE = \'noun\' or TA_TYPE = \'verb\' or TA_TYPE = \'adjective\' group by TA_TOKEN order by count(*) desc')
words_list = cursor.fetchall()
words_df = pd.DataFrame(words_list)
words_df.columns = ['TA_TOKEN', 'count']
# 27x7 figure size for the bar plot (set through seaborn's rc settings)
sns.set(rc={'figure.figsize':(27,7)})
# the rows are sorted by descending count, so head(20) is the 20 most frequent tokens
ax = sns.barplot(x="TA_TOKEN", y="count", data=words_df.head(20))
fig = ax.get_figure()
fig.savefig('plots/barplot_words_freq_NHTSA.png')

In [None]:
# rare words: the last 20 rows of words_df; with the descending sort from the
# previous cell these are the least frequent tokens
words_df.tail(20)

### Restriction to POS-Tags, categories or documents

In [None]:
# restriction to nouns
# barplot of the 20 most frequent noun tokens ("top 20" is applied in SQL)
cursor.execute('select top 20 TA_TOKEN, count(*) from "$TA_CDESCRIND" where TA_TYPE=\'noun\' group by TA_TOKEN order by count(*) desc')
results_words = cursor.fetchall()
results_words_df = pd.DataFrame(results_words)
results_words_df.columns = ['TA_TOKEN', 'count']
# 27x7 figure size for the bar plot (set through seaborn's rc settings)
sns.set(rc={'figure.figsize':(27,7)})
ax = sns.barplot(x="TA_TOKEN", y="count", data=results_words_df)
fig = ax.get_figure()
fig.savefig('plots/barplot_nouns_NHTSA.png')

In [None]:
from wordcloud import WordCloud 
import matplotlib.pyplot as plt
# The 20 most frequent noun tokens together with their counts.
cursor.execute('SELECT top 20 TA_TOKEN, count(*) from "$TA_CDESCRIND" where TA_TYPE=\'noun\' group by TA_TOKEN order by count(*) desc')
# The cursor yields (token, count) rows; the word cloud needs a token -> count mapping.
tmpDict = {row[0]: row[1] for row in cursor}
cloud_builder = WordCloud(width=480, height=480, margin=0)
wordcloud = cloud_builder.generate_from_frequencies(tmpDict)
# Display the generated image:
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.margins(x=0, y=0)
plt.show()
wordcloud.to_file('plots/wordcloud_nouns_NHTSA.png')

## Aufgaben 4

### Ambiguity of words

In [None]:
# create view
cursor.execute('create view POSTAGS_NHTSA as select TA_NORMALIZED as nor, CMPLID as id, TA_TYPE as type, count(*) as count_words from "$TA_CDESCRIND" where TA_TYPE = \'noun\' or TA_TYPE = \'verb\' or TA_TYPE = \'adjective\' group by TA_NORMALIZED, CMPLID, TA_TYPE having count(*)>1')

In [None]:
# drop SQL-View
sql_drop_view = 'drop view POSTAGS_NHTSA'
cursor.execute(sql_drop_view)

In [None]:
cursor.execute('select NOR, ID, count(NOR) from POSTAGS_NHTSA group by NOR, ID having count(NOR) > 1')
words_list = cursor.fetchall()
words_df = pd.DataFrame(words_list)
df = words_df.dropna()
df.head(30)

## Aufgaben 5
### 1. Statistics
#### The distribution of top part-of-speech tags

In [None]:
from textblob import TextBlob
import nltk as nltk

# Fetch the CDESCR value stored for CMPLID 112392 and show it as a frame.
complaint_query = 'select CDESCR from "CMPL100K" where CMPLID = \'112392\''
cursor.execute(complaint_query)
token_results = cursor.fetchall()
token_results_df = pd.DataFrame(token_results)
token_results_df

In [None]:
token_results_df.columns = ['CDESCR']

# str(Series) is the pandas display repr (index labels, a "Name/dtype" footer
# and long values cut off with "..."), so the sentiment was computed on that
# rendering. Build the text from the cell values themselves instead.
complaint_text = ' '.join(str(text) for text in token_results_df['CDESCR'])
blob = TextBlob(complaint_text)
blob.sentiment