In [1]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
warnings.simplefilter("ignore", category=FutureWarning)

In [2]:
#Import all necessary libraries
import glob
import pandas as pd
import numpy as np

from gensim.corpora import Dictionary, MmCorpus
from gensim.models.ldamulticore import LdaMulticore

import fastparquet
import pyLDAvis
import pyLDAvis.gensim
import warnings
import _pickle as pickle

import re
import random
import collections
import seaborn as sns

from gensim import corpora, models
import gensim
import pyLDAvis.gensim

from wordcloud import WordCloud
import matplotlib.pyplot as plt
from scipy import stats

pyLDAvis.enable_notebook()
random.seed(1234)

from sklearn.metrics.cluster import adjusted_rand_score
pd.options.display.max_columns = None

# 1. Topic Model on CommonCrawl


## 1.1. Loading and Filtering the Data
There are 28 files with ~2500 records each. 

In [3]:
%%time
#Load the parquet files into a single dataframe

result = pd.DataFrame()
#for filename in os.listdir("C:\\tmp\\my.tar\\istjoscha_cc_warc_wat_wet_acht_lang"):
for pth in sorted(glob.glob("data/CC/*")):
    ds = pd.read_parquet(pth)
    #print(pth)
    #print(ds.shape)
    result = result.append(ds)
    #print(result.shape)

#In case you are running out of memory, try to run the filter in the following paragraph on ds before appending

CPU times: user 25.1 s, sys: 12.8 s, total: 37.9 s
Wall time: 39.1 s


In [4]:
#filter on English and university-related content
result = result.loc[(result['Guessed-Language']=='en') & (result['Plaintext'].str.contains("niversity"))]

## 1.2 Constructing the Corpus and Building the Model

In [5]:
#auxiliary function for "straight-forward" natural language preprocessing

def cleanup_text(record):
    text = record['Plaintext']
    # Remove newlines
    text = text.replace(r'\n', ' ')
    words = text.split()

    # Default list of Stopwords
    stopwords_core = ['a', u'about', u'above', u'after', u'again', u'against', u'all', u'am', u'an', u'and', u'any',
                      u'are', u'arent', u'as', u'at',
                      u'be', u'because', u'been', u'before', u'being', u'below', u'between', u'both', u'but', u'by',
                      u'can', 'cant', 'come', u'could', 'couldnt',
                      u'd', u'did', u'didn', u'do', u'does', u'doesnt', u'doing', u'dont', u'down', u'during',
                      u'each',
                      u'few', 'finally', u'for', u'from', u'further',
                      u'had', u'hadnt', u'has', u'hasnt', u'have', u'havent', u'having', u'he', u'her', u'here',
                      u'hers', u'herself', u'him', u'himself', u'his', u'how',
                      u'i', u'if', u'in', u'into', u'is', u'isnt', u'it', u'its', u'itself',
                      u'just',
                      u'll',
                      u'm', u'me', u'might', u'more', u'most', u'must', u'my', u'myself',
                      u'no', u'nor', u'not', u'now',
                      u'o', u'of', u'off', u'on', u'once', u'only', u'or', u'other', u'our', u'ours', u'ourselves',
                      u'out', u'over', u'own',
                      u'r', u're',
                      u's', 'said', u'same', u'she', u'should', u'shouldnt', u'so', u'some', u'such',
                      u't', u'than', u'that', 'thats', u'the', u'their', u'theirs', u'them', u'themselves', u'then',
                      u'there', u'these', u'they', u'this', u'those', u'through', u'to', u'too',
                      u'under', u'until', u'up',
                      u'very',
                      u'was', u'wasnt', u'we', u'were', u'werent', u'what', u'when', u'where', u'which', u'while',
                      u'who', u'whom', u'why', u'will', u'with', u'wont', u'would',
                      u'y', u'you', u'your', u'yours', u'yourself', u'yourselves']

    # Custom List of Stopwords - Add your own here
    stopwords_custom = ['']
    stopwords = stopwords_core + stopwords_custom
    stopwords = [word.lower() for word in stopwords]

    text_out = [re.sub('[^a-zA-Z0-9]', '', word) for word in words]  # Remove special characters
    text_out = [word.lower() for word in text_out if
                len(word) > 2 and word.lower() not in stopwords]  # Remove stopwords and words under X length
    return text_out


In [7]:
%%time
#build gensim corpus

result=result.reset_index()
texts=[]
for index, row in result.iterrows():
    texts.append(cleanup_text(row))

dictionary = corpora.Dictionary(texts)
dictionary.filter_extremes(no_below=10, no_above=0.4)
dictionary.compactify()
corpus = [dictionary.doc2bow(text) for text in texts]

CPU times: user 19.6 s, sys: 242 ms, total: 19.9 s
Wall time: 19.9 s


In [8]:
%%time
ldamodelnormal = gensim.models.ldamodel.LdaModel(corpus, num_topics=15, id2word = dictionary, passes=10, chunksize=100, update_every=0, alpha=1/15, random_state=1)


CPU times: user 3min 21s, sys: 962 ms, total: 3min 22s
Wall time: 34.4 s


## 1.3 Inspecting the Results

In [9]:
# helper to create descriptive tables (doc-topic probabilities) and visualizations for LDA models 
def getModelResults(ldamodel, corpus, dictionary):
    vis = pyLDAvis.gensim.prepare(ldamodel,corpus, dictionary, sort_topics=False)
    transformed = ldamodel.get_document_topics(corpus)
    df = pd.DataFrame.from_records([{v:k for v, k in row} for row in transformed])
    return vis, df    

In [10]:
# get the top topic per document into a list
def maxTop(x):
    mx = max(x,key=lambda item:item[1])
    if (mx[1]>0.0):
        return(mx[0])
    else:
        return 99

In [11]:
%%time
# get descriptive stuff for all models
normalv, dfnormal = getModelResults(ldamodelnormal, corpus, dictionary)


CPU times: user 40.9 s, sys: 1.02 s, total: 41.9 s
Wall time: 3min 31s


In [12]:
#print regular topics (top word probabilities)
ldamodelnormal.print_topics(num_words=8)

[(0,
  '0.005*"source" + 0.005*"research" + 0.005*"data" + 0.004*"care" + 0.004*"college" + 0.004*"authors" + 0.004*"presentation" + 0.004*"journal"'),
 (1,
  '0.015*"download" + 0.005*"thin" + 0.004*"films" + 0.004*"magnetic" + 0.003*"research" + 0.003*"book" + 0.003*"film" + 0.003*"international"'),
 (2,
  '0.009*"june" + 0.009*"july" + 0.009*"2017" + 0.007*"february" + 0.007*"january" + 0.007*"march" + 0.007*"october" + 0.007*"september"'),
 (3,
  '0.004*"people" + 0.003*"business" + 0.003*"best" + 0.003*"life" + 0.002*"free" + 0.002*"make" + 0.002*"day" + 0.002*"back"'),
 (4,
  '0.010*"school" + 0.010*"college" + 0.008*"ago" + 0.005*"state" + 0.004*"high" + 0.004*"elementary" + 0.004*"years" + 0.003*"day"'),
 (5,
  '0.003*"free" + 0.003*"design" + 0.003*"data" + 0.002*"share" + 0.002*"2017" + 0.002*"years" + 0.002*"dad" + 0.002*"download"'),
 (6,
  '0.036*"architects" + 0.012*"health" + 0.009*"public" + 0.007*"name" + 0.006*"english" + 0.004*"form" + 0.004*"board" + 0.003*"city"'),

# Teil I - Aufgabe 1
## Für welche Topics können Sie intuitiv Überbegriffe bilden? Notieren Sie sich diese bzw. legen Sie eine entsprechende „lookup-tabelle“ als Datenstruktur an. Welche Topics erscheinen sinnvoll, welche nicht?

In [None]:
topics = {
    0:'Research', 
    1: None,
    2: 'Month',
    3: 'Life',
    4: 'Education', 
    5: 'Medicine',
    6: 'Architect', 
    7: 'Politics', 
    8: 'Country', 
    9: 'Education', 
    10: 'Month', 
    11: 'Technology',
    12: None,
    13: 'Government',
    14: 'Temperature'
}

In [None]:
# create a wordcloud for topic: adult content


# get the top topic for a known adult content document 
docTopTopics = [maxTop(x) for x in ldamodelnormal.get_document_topics(corpus)]
adultTopicId = docTopTopics[15]


#gather most relevant terms for the given topic
topics_terms = ldamodelnormal.state.get_lambda()
tmpDict = {}
for i in range(1, len(topics_terms[0])):
    tmpDict[ldamodelnormal.id2word[i]]=topics_terms[adultTopicId,i]


# draw the wordcloud
wordcloud = WordCloud( margin=0,max_words=20 ).generate_from_frequencies(tmpDict)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.margins(x=0, y=0)
plt.show()
print("Adult Topic Id = " + str(adultTopicId))

In [None]:
# inspect regular topics - ATTENTION: ALL TOPICS ARE SHIFTED WITH ID +1 w.r.t. GENSIM
normalv

In [None]:
#inspect first 20 documents and their topic distributions
pd.set_option('display.max_colwidth', -1)
pd.concat([result['Target-URI'], dfnormal], axis=1).iloc[0:20]


# 2. Topic Model on NHTSA (Using HANA Text Analysis Index Table)
## 2.1  Constructing the Corpus (Using HANA Text Analysis Index Table) and Building the Model¶

In [None]:
#data ingestion and filtering

cmpl = pd.read_csv("data/NHTSA_HANA/cmpl.csv", header=None,  keep_default_na=False)
ta = pd.read_csv("data/NHTSA_HANA/data.csv", header=None,  keep_default_na=False)

ta.columns = ["CMPLID", "TA_RULE","TA_COUNTER","TA_TOKEN","TA_LANGUAGE","TA_TYPE","TA_TYPE_EXPANDED",
              "TA_NORMALIZED" ,"TA_STEM","TA_PARAGRAPH","TA_SENTENCE","TA_CREATED_AT","TA_OFFSET","TA_PARENT" ]
cmpl.columns = ["CMPLID", "ODINO", "MFR_NAME", "MAKETEXT", "MODELTXT", "YEARTXT", "CRASH", "FAILDATE", "FIRE", "INJURED", "DEATHS", "COMPDESC", "CITY", "STATE", "VIN", "DATEA", "LDATE", "MILES", "OCCURRENCES", "CDESCR", "CMPL_TYPE", "POLICE_RPT_VN", "PURCH_DT", "ORIG_OWNER_YN", "ANTI_BRAKES_YN", "CRUISE_CONT_YN", "NUM_CYLS", "DRIVE_TRAIN", "FUEL_SYS", "FUEL_TYPE", "TRANS_TYPE", "VEH_SPEED", "DOT", "TIRE_SIZE", "LOC_OF_TIRE", "TIRE_FAIL_TYPE", "ORIG_EQUIP_YN", "MANUF_DT", "SEAT_TYPE", "RESTRAINT_TYPE", "DEALER_NAME", "DEALER_TEL", "DEALER_CITY", "DEALER_STATE", "DEALER_ZIP", "PROD_TYPE", "REPAIRED_YN", "MEDICAL_ATTN", "VEHICELS_TOWED_YN"]


In [None]:
#filtering and aggregation

ta = ta[ta.TA_TYPE.isin(['noun', 'adjective']) ]
ta.TA_TOKEN = ta.TA_TOKEN.str.lower()
ta = ta[~ta.TA_TOKEN.isin(['car', 'vehicle']) ] #use as additional stop words
cmpl = cmpl[cmpl.COMPDESC.isin([ 'AIR BAGS','VISIBILITY/WIPER','EXTERIOR LIGHTING','FUEL/PROPULSION SYSTEM', 'SERVICE BRAKES','WHEELS']) ]

ta = ta.merge(cmpl.loc[:,'CMPLID'], on=['CMPLID'], how='inner')
tagrouped = ta.groupby('CMPLID')['TA_TOKEN'].apply(list)

In [None]:
#show category counts
relevantComplaints = cmpl[cmpl.CMPLID.isin(tagrouped.index.tolist())].sort_values(by=['CMPLID'])
counter=collections.Counter(relevantComplaints.COMPDESC)
print(counter)
print("Anzahl Dokumente: " + str(len(relevantComplaints)))

In [None]:
#create gensim data structure
nhtsadic = corpora.Dictionary(tagrouped.tolist())
nhtsadic.filter_extremes(no_below=10, no_above=0.4)
nhtsadic.compactify()
nhtsacorpus = [nhtsadic.doc2bow(text) for text in tagrouped]

In [None]:
%%time

#build topic model and create descriptive stuff
nhtsalda = LdaMulticore(nhtsacorpus, num_topics=10, id2word = nhtsadic, passes=20, alpha=0.0000001, random_state=1)
nhtsavis, dfnhtsa = getModelResults(nhtsalda, nhtsacorpus, nhtsadic)

## 2.2 Inspecting the Results

In [None]:
nhtsavis