# RQ1: Which technologies have been investigated in the last decade?

## Module importieren

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from spacy.symbols import NOUN,X
from spacy.matcher import Matcher
from spacy.util import filter_spans

## Daten einlesen
In diesem Block werden die Daten in einen Pandas-DataFrame geladen. Anschließend werden die Spalten zur besseren Übersichtlichkeit umbenannt.

In [2]:
# Load the raw export, keeping only the columns this analysis needs.
# NOTE: `usecols` only selects columns; read_csv returns them in file order, not
# in the order listed here. Renaming by name (instead of assigning `data.columns`
# positionally) keeps every label attached to the right column regardless of the
# CSV layout.
data = pd.read_csv('./data/data.csv',
                   usecols=['Unnamed: 0', 'Document Title', 'Abstract', 'Publication Year'])
data = data.rename(columns={'Unnamed: 0': 'index',
                            'Document Title': 'title',
                            'Publication Year': 'year',
                            'Abstract': 'abstract'})

# Use the original row identifier as the index (the index keeps the name 'index').
data = data.set_index('index')


## NLP Model laden

In [3]:
# Load spaCy's small English pipeline; the functions below use it for tokenisation and POS tags.
nlp = spacy.load("en_core_web_sm")

## Matcher initialisieren

In [4]:
# Rule-based token matcher that shares the vocabulary of the loaded pipeline.
matcher = Matcher(nlp.vocab)

## Pattern festlegen und dem Matcher hinzufügen

In [5]:
# Optional adjective followed by exactly two nouns.
pattern_test = [{'POS': 'ADJ', 'OP': '?'},
                {'POS': 'NOUN'},
                {'POS': 'NOUN'}]

# Optional adjective, then one or more nouns, then an optional further noun.
# This pattern is currently not registered: its matcher.add call is commented out.
pattern_exmaple = [{'POS': 'ADJ', 'OP': '?'},
                   {'OP': '+', 'POS': 'NOUN'},
                   {'POS': 'NOUN', 'OP': '?'}]

In [6]:
# Register only the two-noun pattern under the key "match_test";
# the broader example pattern stays disabled.
matcher.add("match_test", [pattern_test])
#matcher.add("match_example", [pattern_exmaple])

## Funktion für die TermDokumentenMatrix

In [7]:
def CreateTermDocMatrix(column):
    """Build a term-by-year count matrix from one text column of the global ``data`` frame.

    Each document in ``data[column]`` is cleaned with ``my_preprocessor``,
    tokenised with ``Retokenize`` and counted with a ``CountVectorizer``. The
    per-document counts are then summed per publication year.

    Parameters
    ----------
    column : str
        Name of the text column in ``data`` to vectorise (e.g. ``'abstract'``).

    Returns
    -------
    pandas.DataFrame
        Rows are terms, columns are the distinct values of ``data['year']``
        (column axis named ``'year'``), values are int32 occurrence counts.
    """
    # Options tried previously and left disabled: stop_words='english', ngram_range=(1, 2)
    count_vectorizer = CountVectorizer(preprocessor=my_preprocessor,
                                       tokenizer=Retokenize,
                                       dtype=np.int32)
    sparse_matrix = count_vectorizer.fit_transform(data[column].values.astype('U'))

    # get_feature_names() was removed in scikit-learn 1.2; keep a fallback so the
    # notebook still runs on older installations.
    if hasattr(count_vectorizer, 'get_feature_names_out'):
        terms = count_vectorizer.get_feature_names_out()
    else:
        terms = count_vectorizer.get_feature_names()

    # toarray() yields a plain ndarray (todense() returns the legacy np.matrix type).
    df = pd.DataFrame(sparse_matrix.toarray(), columns=terms, dtype=np.int32)

    # Group by the year values positionally instead of storing them in a 'year' column:
    #  * the matrix rows follow the row order of `data`, whose index labels need not
    #    be 0..n-1, so label-aligned assignment could misplace or lose years;
    #  * a 'year' column would overwrite the counts of a vocabulary term "year".
    years = data['year'].to_numpy()
    counts_per_year = df.groupby(years).sum()
    counts_per_year.index.name = 'year'

    return counts_per_year.transpose().astype('int32')

In [8]:
def ShowWord(data,word):
    """Draw a bar chart of the row of ``data`` labelled ``word``.

    Parameters
    ----------
    data : object supporting ``.loc[...]`` (e.g. a pandas DataFrame)
        Table from which the row is selected.
    word : hashable
        Row label to select.

    Returns
    -------
    None
        The chart is produced as a side effect of ``.plot.bar()``.
    """
    data.loc[word].plot.bar()

In [9]:
def SpaltenEntf(data, stop_terms=None):
    """Remove stop-word columns and purely numeric columns from ``data`` in place.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are filtered. It is modified in place.
    stop_terms : collection, optional
        Column labels to remove. When omitted, the module-level ``stopwords``
        list is used (looked up at call time).

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, after the columns were removed.

    Notes
    -----
    Only string labels are tested with ``str.isnumeric``; non-string labels
    (e.g. integer years) are kept instead of raising ``AttributeError``.
    """
    if stop_terms is None:
        stop_terms = stopwords

    # Decide first, drop once: avoids mutating the frame while walking its columns
    # and tolerates duplicate labels.
    unwanted = [label for label in data.columns
                if label in stop_terms
                or (isinstance(label, str) and label.isnumeric())]
    data.drop(columns=unwanted, inplace=True)
    return data

In [10]:
def Retokenize(text):
    """Tokenise ``text`` and merge every matcher hit into a single token.

    The text is parsed with the global ``nlp`` pipeline, the global ``matcher``
    is run over it, overlapping hits are reduced with ``filter_spans`` and each
    remaining span is merged into one token tagged as NOUN.

    Parameters
    ----------
    text : str
        Text to tokenise.

    Returns
    -------
    list of str
        Text of every token after merging.
    """
    parsed = nlp(text)

    # Each match is a (match_id, start, end) triple; only the token span is needed.
    candidate_spans = [parsed[start:end] for _, start, end in matcher(parsed)]
    merged_spans = filter_spans(candidate_spans)

    with parsed.retokenize() as merger:
        for span in merged_spans:
            merger.merge(span, attrs={"POS": "NOUN"})

    return [tok.text for tok in parsed]
        

In [11]:
def my_preprocessor(text):
    """Lower-case ``text`` and strip tokens that carry no topical content.

    The text is parsed with the global ``nlp`` pipeline. Digits, punctuation,
    number-like tokens, stop words, whitespace tokens and tokens tagged with
    POS ``X`` are dropped; the remaining tokens are lower-cased.

    Parameters
    ----------
    text : object
        Input document; converted with ``str()`` before parsing.

    Returns
    -------
    str
        The kept tokens, each followed by a single space (empty string when
        nothing is kept).
    """
    cleaned = ''
    for tok in nlp(str(text)):
        rejected = (tok.is_digit or tok.is_punct or tok.like_num
                    or tok.is_stop or tok.is_space or tok.pos == X)
        if not rejected:
            cleaned += tok.lower_
        # Make sure the running result ends with exactly one separator.
        if cleaned and not cleaned.endswith(' '):
            cleaned += ' '
    return cleaned

## Term Dokumenten Matrix erstellen

In [12]:
# Term-by-year count matrix built from the abstracts (rows: terms, columns: years).
abstract = CreateTermDocMatrix('abstract')

## Top-Themen pro Jahr

In [13]:
# Whitespace-only terms treated as stop words (read by SpaltenEntf).
stopwords = [' ', '  ']

In [14]:
# Number of highest-count terms printed per year.
anzahl_top_topics = 50

In [15]:
# Stop-word rows are currently not dropped; the full term-by-year matrix is used as-is.
#data_words = abstract.drop(stopwords, axis = 0)
data_words = abstract

In [16]:
# Print the most frequent terms for every year present in the matrix.
# Iterating over the columns (instead of a hard-coded 2009-2021 range) avoids a
# KeyError when a year has no documents and picks up additional years automatically.
for year in data_words.columns:
    print(data_words[year].nlargest(anzahl_top_topics))
    print('--------------------------------------------')


based                   41
systems                 26
software                21
existing                20
provide                 20
engineering             19
results                 19
driven                  18
present                 18
software engineering    18
model                   17
paper                   17
development             16
domain                  16
found                   16
methods                 16
new                     16
approach                15
developers              15
embedded                15
models                  15
processes               15
metrics                 14
help                    13
provides                13
studies                 13
formal                  12
proposed                12
support                 12
techniques              12
tools                   12
use                     12
analysis                11
design                  11
end                     11
oriented                11
requirements            11
r

In [17]:
# Preview the first 50 rows (terms) of the term-by-year matrix.
abstract.head(50)


year,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
"""",0,0,0,0,0,0,0,0,0,0,1,2,0
$,0,0,0,0,0,2,0,9,2,2,4,4,6
'll,0,0,0,0,1,0,0,0,0,0,0,0,0
+,2,0,0,0,1,0,0,0,0,0,0,0,0
",",0,0,0,0,0,0,0,0,0,0,0,1,1
-calculus,0,0,0,0,0,0,0,1,0,0,0,0,0
-fold,0,0,0,0,0,0,0,0,1,0,0,0,0
-ilities healthcare,0,0,0,0,0,0,0,1,0,0,0,0,0
-ility,0,0,0,0,0,0,0,1,0,0,0,0,0
-inspired,0,0,0,0,0,0,0,0,0,0,1,0,0


## Counter initialisieren

In [18]:
# Empty frequency counters. `words` is only filled by the loop in the next cell,
# which is currently commented out; `words_chunk` is not used in the visible cells.
words = Counter()
words_chunk = Counter()

In [19]:
#for columns in data.itertuples():
#    text = columns[3] #1 = title 2 = year 3 = abstract
    # Verarbeite den Text
#    if type(text) is str:
#            doc = nlp(text)
    
#    matches = matcher(doc)
#    sp  = [doc[match[1]:match[2]] for match in matches]
#    filtered = filter_spans(sp)
#    with doc.retokenize() as retokenizer:
#        for s in filtered:
#            retokenizer.merge(s, attrs = {"POS": "NOUN"})
#    [(token.pos_, token.text) for token in doc]

#    for token in doc:
        # Greife auf den Text, die Wortart und die Dependenzrelation des Tokens zu
#        token_text = token.text
#        token_pos = token.pos_
#        token_dep = token.dep_
#        token_ent = token.ent_type_

#        if token.pos == NOUN:
#            if not token.is_stop:
#                words[token_text] += 1
                                          

## Wörter darstellen

In [20]:
# Sanity check: summing over axis 0 collapses the terms and yields a Series (one total per year).
print(type(abstract.sum(axis = 0)))

<class 'pandas.core.series.Series'>


## Liste erstellen

In [21]:
# Take the 30 most frequent entries of the `words` counter and keep only the terms.
l = list(words.most_common(30))
wordlist = [term for term, _count in l]


In [22]:
# Total count per term across all years; keep the 20 most frequent terms.
# NOTE(review): this rebinds `words` from a Counter to a pandas Series.
words = abstract.sum(axis= 1).nlargest(20)

In [None]:
# Restrict the term-by-year matrix to those top terms (row order follows `abstract`).
b = abstract[abstract.index.isin(words.index)] 

In [None]:
b

## Balkendiagramm erstellen

In [None]:
import altair as alt
import pandas as pd

# One bar chart per year column of `b`.
# A bare chart expression inside a `for` loop is never rendered by Jupyter (only the
# value of a cell's last expression is displayed), so the charts are collected and
# stacked vertically; the concatenated chart is the final expression of the cell.
# Iterating over `b.columns` avoids a KeyError for years that are not in the data.
charts = []
for year in b.columns:
    source = pd.DataFrame({'Technologies': b.index,
                           'Anzahl': b[year]})

    charts.append(
        alt.Chart(source).mark_bar().encode(
            x='Technologies',
            y='Anzahl'
        ).properties(title=str(year))
    )

alt.vconcat(*charts)


In [None]:
#print(cosine_similarity(b))