# RQ1: Which technologies have been investigated in the last decade?

## Module importieren

In [None]:
import pandas as pd
import numpy as np
from collections import Counter
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from spacy.symbols import NOUN
from spacy.matcher import Matcher
from spacy.util import filter_spans

## Daten einlesen
In diesem Block werden die Daten in einen Pandas-DataFrame geladen. Anschließend werden die Spalten zur besseren Übersichtlichkeit umbenannt.

In [None]:
# Load the data: only the four columns needed for the analysis are read.
data = pd.read_csv(
    './data/data.csv',
    usecols=['Unnamed: 0', 'Document Title', 'Abstract', 'Publication Year'],
)

# Rename by source-column name instead of by position. `usecols` does not
# control the column order of the result (pandas keeps the order found in the
# file), so a positional `data.columns = [...]` would silently swap 'year' and
# 'abstract' whenever the CSV stores 'Abstract' before 'Publication Year'.
data = data.rename(columns={
    'Unnamed: 0': 'index',
    'Document Title': 'title',
    'Publication Year': 'year',
    'Abstract': 'abstract',
})

# Use the original row number as index and pin the column order that the
# later cells rely on (title, year, abstract).
data = data.set_index('index').reindex(columns=['title', 'year', 'abstract'])

data.head()

data.groupby('year').count()

## Funktion für die TermDokumentenMatrix

In [None]:
def CreateTermDocMatrix(column):
    """Build a term-by-year count matrix for one text column of ``data``.

    Reads the module-level DataFrame ``data``. The values of ``column`` are
    vectorised into unigram and bigram counts (English stop words removed),
    and the per-document counts are summed per value of ``data['year']``.

    Args:
        column: Name of the text column in ``data`` to vectorise.

    Returns:
        A DataFrame with one row per term and one column per year, holding
        int32 counts.
    """
    count_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 2), dtype=np.int32)
    # astype('U') coerces every cell to a unicode string so that cells which
    # are not strings do not make the vectoriser fail.
    sparse_matrix = count_vectorizer.fit_transform(data[column].values.astype('U'))

    # get_feature_names() was removed in scikit-learn 1.2; keep a fallback for
    # installations that predate get_feature_names_out().
    if hasattr(count_vectorizer, 'get_feature_names_out'):
        terms = count_vectorizer.get_feature_names_out()
    else:
        terms = count_vectorizer.get_feature_names()

    df = pd.DataFrame(sparse_matrix.toarray(), columns=terms, dtype=np.int32)

    # Group by a separate, positionally aligned key instead of adding a 'year'
    # column to df:
    #   * df has a fresh 0..n-1 index while `data` keeps its own index labels,
    #     so `df['year'] = data['year']` would align on labels and could
    #     mis-assign or lose years;
    #   * a vocabulary term "year" would be overwritten by that assignment.
    years = pd.Series(data['year'].to_numpy(), index=df.index, name='year')
    return df.groupby(years).sum().transpose().astype('int32')

In [None]:
def ShowWord(data, word):
    """Draw a bar chart of the row of ``data`` that is labelled ``word``.

    Args:
        data: Object whose ``.loc[word]`` yields something with ``.plot.bar()``.
        word: Row label to look up.
    """
    data.loc[word].plot.bar()

In [None]:
def SpaltenEntf(data):
    """Remove stop-word and purely numeric columns from ``data`` in place.

    A column is removed when its label is contained in the module-level
    collection ``stopwords`` or when the label is a string consisting only of
    numeric characters. Labels that are not strings (for example integer
    labels) are never treated as numeric text and are only removed if they are
    listed in ``stopwords``.

    NOTE(review): ``stopwords`` is not defined in the visible part of this
    file; it must exist at module level before this function is called.

    Args:
        data: DataFrame whose columns are filtered. It is modified in place.

    Returns:
        The same ``data`` object, after the columns have been removed.
    """
    to_drop = [
        label
        for label in data.columns
        # isinstance guard: calling .isnumeric() on a non-string label raised
        # AttributeError before.
        if label in stopwords or (isinstance(label, str) and label.isnumeric())
    ]
    data.drop(columns=to_drop, inplace=True)
    return data

## Erstellen der TermDokumentenmatrix

In [None]:
def Retokenize(text):
    """Parse ``text`` and merge every matched phrase into a single NOUN token.

    Uses the module-level spaCy pipeline ``nlp`` and the module-level
    ``matcher``. All spans found by the matcher are reduced to a
    non-overlapping set with ``filter_spans`` and each remaining span is merged
    into one token whose part of speech is set to NOUN.

    Args:
        text: Raw text to process. A value that is not a ``str`` (such as a
            missing cell) is treated as empty text.

    Returns:
        The spaCy ``Doc`` with the matched spans merged. For non-string input
        this is the ``Doc`` of the empty string.
    """
    # A non-string used to leave `doc` unassigned and crash on the next line
    # with UnboundLocalError; process it as empty text instead.
    if not isinstance(text, str):
        text = ""
    doc = nlp(text)

    # Every match is a (match_id, start, end) triple of token offsets.
    spans = [doc[start:end] for _, start, end in matcher(doc)]

    # The patterns produce overlapping candidates; filter_spans resolves the
    # overlaps so that each token ends up in at most one merged span.
    with doc.retokenize() as retokenizer:
        for span in filter_spans(spans):
            retokenizer.merge(span, attrs={"POS": "NOUN"})

    return doc
                
        

In [None]:
from matplotlib import pyplot as plt
# Placeholder for the disabled plotting scaffold below; nothing in the visible
# part of this file fills it.
years = []

# Disabled matplotlib scaffold for a bar chart.
# NOTE(review): `anzah` is not defined anywhere in the visible file, so the
# lines below cannot be re-enabled as they are.

# Bar chart
#plt.bar(range(len(years)),anzah)

# Title
#plt.title("Toller Plot")
#plt.ylabel("# of awards")

#plt.xticks(range(len(years)),years)
#plt.show()

## NLP-Modell laden

In [None]:
# Load the spaCy pipeline "en_core_web_sm". Retokenize and the counting loop
# use it through this global name.
nlp = spacy.load("en_core_web_sm")

## Counter und Matcher initialisieren

In [None]:
# Token-pattern matcher sharing the vocabulary of the loaded pipeline; the
# patterns are registered further down.
matcher = Matcher(nlp.vocab)
# Frequency counter for the noun tokens found in the abstracts.
words = Counter()
# NOTE(review): never used in the visible part of this file.
words_chunk = Counter()

## Pattern festlegen und dem Matcher hinzufügen

In [None]:
# Two consecutive tokens that are both tagged NOUN (a noun compound).
pattern_test = [{'POS': 'NOUN'},
                {'POS': 'NOUN'}]

# Optional adjective, then one or more NOUN tokens, then one more optional
# NOUN token.
# NOTE(review): the identifier is misspelled ("exmaple"). It is referenced
# under exactly this name where the pattern is registered, so any rename has
# to change both places together.
pattern_exmaple = [{'POS': 'ADJ', 'OP': '?'},
                   {'OP': '+', 'POS': 'NOUN'},
                   {'POS': 'NOUN', 'OP': '?'}]

In [None]:
# Register both patterns under their own rule name. The second argument is a
# list of patterns; each rule here has exactly one.
matcher.add("match_test", [pattern_test])
matcher.add("match_example", [pattern_exmaple])

In [None]:
# Count how often every noun that is not a stop word occurs in the abstracts.
for row in data.itertuples():
    text = row.abstract

    # Skip rows without a textual abstract. Previously such a row kept the
    # `doc` of the preceding row and counted that abstract a second time (or
    # raised NameError when it was the very first row).
    if not isinstance(text, str):
        continue

    # Retokenize parses the text and merges matched phrases into single NOUN
    # tokens, so multi-word terms are counted as one entry.
    doc = Retokenize(text)

    for token in doc:
        if token.pos == NOUN and not token.is_stop:
            words[token.text] += 1
                                          

## Wörter darstellen

In [None]:
# Show the 100 most frequent nouns together with their counts (cell output).
words.most_common(100)

## Liste erstellen

In [None]:
# Keep only the terms (without their counts) of the 30 most frequent nouns,
# in descending order of frequency.
wordlist = [term for term, _count in words.most_common(30)]


In [None]:
# Turn the top terms into a string Series for the membership test below.
# NOTE(review): this rebinds `words`, which held the Counter up to this point;
# the Counter has to be re-initialised before the counting loop is run again.
words = pd.Series(wordlist,dtype=str)

In [None]:
# Term-by-year count matrix built from the 'abstract' column.
abstract = CreateTermDocMatrix('abstract')

In [None]:
# Keep only the matrix rows whose term is one of the selected top nouns.
# NOTE(review): the matrix terms come from CountVectorizer (uni- and bigrams),
# the top nouns from spaCy tokens; terms that differ in case or span more than
# two words presumably find no matching row here — confirm.
b = abstract[abstract.index.isin(words)]

In [None]:
# Preview the first rows of the filtered matrix (cell output).
b.head()

## Balkendiagramm erstellen

In [None]:
import altair as alt
import pandas as pd

# Counts of the selected terms for the year 2020, shaped for Altair:
# one row per term with its label and its number of occurrences.
source = pd.DataFrame({
    'Technologies': b.index,
    'Anzahl': b[2020],
})

# Bar chart with one bar per term; the bar height is the term's count.
bar_chart = alt.Chart(source).mark_bar()
bar_chart.encode(x='Technologies', y='Anzahl')


In [None]:
# After transposing, each row of the matrix is one year's vector of term
# counts, so this prints the pairwise cosine similarity between the years.
print(cosine_similarity(b.transpose()))