# RQ1: Which technologies have been investigated in the last decade?

## Module importieren

In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from spacy.symbols import NOUN
from spacy.matcher import Matcher
from spacy.util import filter_spans

## Daten einlesen
In diesem Block werden die Daten in einen Pandas-DataFrame geladen. Anschließend werden die Spalten zur besseren Übersichtlichkeit umbenannt.

In [2]:
# Load the data.
# Columns are renamed through an explicit header -> name mapping instead of a
# positional list: `usecols` does not control the order of the returned columns
# (pandas keeps the order found in the file), so positional names are only
# correct if the CSV happens to be laid out as title / year / abstract.
data = (
    pd.read_csv(
        './data/data.csv',
        usecols=['Unnamed: 0', 'Document Title', 'Abstract', 'Publication Year'],
    )
    .rename(columns={
        'Unnamed: 0': 'index',
        'Document Title': 'title',
        'Publication Year': 'year',
        'Abstract': 'abstract',
    })
    .set_index('index')
    # Pin the column order so that positional access in later cells
    # (title = 1, year = 2, abstract = 3 in itertuples) keeps working.
    .loc[:, ['title', 'year', 'abstract']]
)

data.head()

Unnamed: 0_level_0,title,year,abstract
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Is an Athletic Approach the Future of Software...,2016,Traditional software engineering education app...
1,What Do We Know about Knowledge Management? Pr...,2009,There have been many claims about knowledge ma...
2,Global Software Engineering: An Industry Persp...,2016,Professional software products and IT systems ...
3,Advancing Software Engineering Professional Ed...,2011,The paper mentions that a reference curriculum...
4,Improving the State of Automotive Software Eng...,2017,The automotive industry is fundamentally chang...


## Funktion für die TermDokumentenMatrix

In [3]:
def CreateTermDocMatrix(column):
    """Build a term-by-year count matrix for one text column of ``data``.

    The texts in ``data[column]`` are vectorised into unigram and bigram
    counts (English stop words removed); the per-document counts are then
    summed per value of ``data['year']``.

    Parameters
    ----------
    column : str
        Name of the text column of the notebook-level ``data`` frame.

    Returns
    -------
    pandas.DataFrame
        One row per term and one column per year, holding int32 counts.
        The column axis is named ``'year'``.
    """
    count_vectorizer = CountVectorizer(stop_words='english', ngram_range=(1, 2), dtype=np.int32)
    sparse_matrix = count_vectorizer.fit_transform(data[column].values.astype('U'))

    # get_feature_names() was removed in scikit-learn 1.2; keep it only as a
    # fallback for installations that predate get_feature_names_out().
    if hasattr(count_vectorizer, 'get_feature_names_out'):
        terms = count_vectorizer.get_feature_names_out()
    else:
        terms = count_vectorizer.get_feature_names()

    # toarray() yields a plain ndarray rather than the legacy np.matrix type.
    df = pd.DataFrame(sparse_matrix.toarray(), columns=terms, dtype=np.int32)

    # Group by a separate key instead of writing a 'year' column into df:
    # a 'year' column would overwrite the counts of the vocabulary term
    # "year". The key is built positionally (to_numpy) so it matches the
    # document rows even when the index labels of `data` are not 0..n-1.
    year_key = pd.Series(data['year'].to_numpy(), index=df.index, name='year')

    returndf = df.groupby(year_key).sum().transpose().astype('int32')
    return returndf

In [4]:
def ShowWord(data, word):
    """Draw a bar chart of the row labelled ``word`` in ``data``.

    Parameters
    ----------
    data : object supporting ``.loc[label]`` (e.g. a pandas DataFrame)
        Table whose row is looked up.
    word : hashable
        Row label to plot.

    Returns
    -------
    None
        The chart is produced as a side effect of ``plot.bar()``.
    """
    data.loc[word].plot.bar()

In [5]:
def SpaltenEntf(data, stop_words=None):
    """Remove stop-word columns and purely numeric columns from ``data``.

    The frame is modified in place and also returned, so existing callers
    that rely on either behaviour keep working.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose column labels are checked.
    stop_words : collection, optional
        Labels to remove. When omitted, the notebook-level name ``stopwords``
        is used (it has to be defined before the call; its definition is not
        part of the visible notebook section).

    Returns
    -------
    pandas.DataFrame
        The same object as ``data``, without the removed columns.
    """
    if stop_words is None:
        stop_words = stopwords

    # Only string labels can be "numeric text"; other label types (e.g. integer
    # years) have no .isnumeric() and are kept unless they are stop words.
    unwanted = [
        label
        for label in data.columns
        if label in stop_words or (isinstance(label, str) and label.isnumeric())
    ]

    data.drop(columns=unwanted, inplace=True)
    return data

## Erstellen der TermDokumentenmatrix

In [6]:
def Retokenize(text):
    """Parse ``text`` and merge matched spans into single NOUN tokens.

    The notebook-level ``nlp`` pipeline parses the text, the notebook-level
    ``matcher`` finds candidate spans, and every span kept by
    ``filter_spans`` is merged into one token whose POS is set to NOUN.

    Parameters
    ----------
    text : str
        Text to process. Any non-string value (e.g. NaN for a missing
        abstract) is treated as empty text.

    Returns
    -------
    spacy.tokens.Doc
        The parsed document with the merged tokens.
    """
    # Previously a non-string input left `doc` unassigned and the function
    # failed with UnboundLocalError; an empty document is returned instead.
    doc = nlp(text if isinstance(text, str) else "")

    # Each match is a (match_id, start, end) triple of token offsets.
    spans = [doc[start:end] for _match_id, start, end in matcher(doc)]

    # filter_spans resolves overlapping candidates before merging, so that no
    # token is part of two merged spans.
    with doc.retokenize() as retokenizer:
        for span in filter_spans(spans):
            retokenizer.merge(span, attrs={"POS": "NOUN"})

    return doc
                
        

In [7]:
from matplotlib import pyplot as plt
# Placeholder list; nothing in the visible part of the notebook fills or reads it.
years = []

# The plotting code below is disabled (commented out).
# Bar chart
#plt.bar(range(len(years)),anzah)

# Title
#plt.title("Toller Plot")
#plt.ylabel("# of awards")

#plt.xticks(range(len(years)),years)
#plt.show()

## NLP-Modell laden

In [8]:
# Load the spaCy pipeline "en_core_web_sm"; it is used as `nlp` by Retokenize
# and by the noun-counting loop, and its vocabulary is shared with the matcher.
nlp = spacy.load("en_core_web_sm")

## Counter und Matcher initialisieren

In [9]:
# Token-pattern matcher bound to the vocabulary of the loaded pipeline.
matcher = Matcher(nlp.vocab)
# Two independent, initially empty tallies.
words, words_chunk = Counter(), Counter()

## Pattern festlegen und dem Matcher hinzufügen

In [10]:
# Two directly adjacent nouns.
pattern_test = [
    dict(POS='NOUN'),
    dict(POS='NOUN'),
]

# An optional adjective, one or more nouns, then an optional further noun.
pattern_exmaple = [
    dict(POS='ADJ', OP='?'),
    dict(OP='+', POS='NOUN'),
    dict(POS='NOUN', OP='?'),
]

In [11]:
# Register both patterns with the matcher, each under its own rule name and
# wrapped in a list because a rule takes a list of patterns.
matcher.add("match_test", [pattern_test])
matcher.add("match_example", [pattern_exmaple])

In [12]:
# Start from an empty tally so that re-running this cell does not add the
# same abstracts to the counts a second time.
words = Counter()

for row in data.itertuples():
    text = row.abstract

    # Skip rows without a usable abstract (e.g. NaN). Previously such a row
    # fell through with the document of the preceding row, whose nouns were
    # then counted twice (or raised NameError if it was the very first row).
    if not isinstance(text, str):
        continue

    # Parse the abstract and merge the matched spans into single NOUN tokens.
    doc = Retokenize(text)

    # Tally every noun token that is not a stop word.
    for token in doc:
        if token.pos == NOUN and not token.is_stop:
            words[token.text] += 1
                                          

## Wörter darstellen

In [13]:
# Show the 100 most frequent counted noun tokens together with their counts.
words.most_common(100)

[('paper', 712),
 ('approach', 634),
 ('developers', 580),
 ('results', 458),
 ('software', 365),
 ('code', 317),
 ('article', 314),
 ('percent', 279),
 ('system', 267),
 ('model', 266),
 ('time', 259),
 ('%', 256),
 ('techniques', 241),
 ('data', 235),
 ('systems', 234),
 ('state', 208),
 ('set', 207),
 ('software engineering', 198),
 ('number', 193),
 ('performance', 192),
 ('study', 188),
 ('analysis', 185),
 ('tools', 185),
 ('work', 181),
 ('problem', 177),
 ('effectiveness', 171),
 ('framework', 171),
 ('models', 171),
 ('development', 169),
 ('terms', 169),
 ('method', 167),
 ('researchers', 163),
 ('approaches', 162),
 ('projects', 161),
 ('users', 160),
 ('use', 158),
 ('quality', 153),
 ('process', 152),
 ('practitioners', 151),
 ('application', 149),
 ('applications', 148),
 ('technique', 148),
 ('tool', 147),
 ('context', 144),
 ('research', 143),
 ('challenges', 138),
 ('requirements', 138),
 ('methods', 132),
 ('order', 130),
 ('issue', 129),
 ('programs', 128),
 ('testin

## Liste erstellen

In [14]:
# Keep the 30 most frequent (term, count) pairs and collect just the terms.
l = list(words.most_common(30))
wordlist = [term for term, _count in l]


In [15]:
# NOTE: this rebinds `words` from the Counter of noun counts to a string
# Series holding only the selected terms; from here on `words` is the Series.
words = pd.Series(wordlist,dtype=str)

In [16]:
# Term-by-year count matrix built from the 'abstract' column of `data`.
abstract = CreateTermDocMatrix('abstract')

In [17]:
# Keep only the rows of the matrix whose term is one of the selected words.
b = abstract[abstract.index.isin(words)]

In [18]:
# Preview the first rows of the filtered term-by-year matrix.
b.head()

year,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
analysis,36,32,22,36,54,31,52,51,53,49,102,130,61
approach,36,35,31,68,87,68,75,75,85,82,111,181,60
article,22,7,12,13,8,13,37,36,23,90,27,21,8
code,40,31,34,58,60,65,90,62,79,87,168,275,167
data,36,28,17,36,91,38,48,87,56,60,92,113,57


## Balkendiagramm erstellen

In [19]:
import altair as alt
# pandas is already imported as `pd` in the first cell; this repeats that import.
import pandas as pd

# Table for the chart: one row per selected term with its count in the
# column labelled 2020 of `b` (the year is hard-coded here).
source = pd.DataFrame({'Technologies': b.index,
                       'Anzahl': b[2020]})



# Bar chart: terms on the x axis, their 2020 counts on the y axis.
alt.Chart(source).mark_bar().encode(
    x='Technologies',
    y='Anzahl'
)


In [20]:
# Pairwise cosine similarity between the per-year count vectors: after the
# transpose each row is one year, so entry (i, j) compares year i with year j
# in the column order of `b`.
print(cosine_similarity(b.transpose()))

[[1.         0.97457953 0.9660127  0.96074184 0.95607819 0.96129787
  0.93543892 0.97223078 0.92371024 0.93716621 0.90585318 0.84859095
  0.83467648]
 [0.97457953 1.         0.97296525 0.97234678 0.97737679 0.97511477
  0.94352211 0.96673778 0.93148743 0.93175637 0.93300216 0.8918113
  0.86573226]
 [0.9660127  0.97296525 1.         0.96849395 0.9679746  0.97581826
  0.94528654 0.94939307 0.93695    0.91986847 0.91971676 0.88733471
  0.87624028]
 [0.96074184 0.97234678 0.96849395 1.         0.98395503 0.98192556
  0.97245839 0.97116567 0.96376705 0.95702659 0.95695472 0.92790086
  0.90242476]
 [0.95607819 0.97737679 0.9679746  0.98395503 1.         0.98344257
  0.96935088 0.97980445 0.96472856 0.95095363 0.95598283 0.92427778
  0.89509642]
 [0.96129787 0.97511477 0.97581826 0.98192556 0.98344257 1.
  0.98066979 0.97613089 0.97901272 0.95839582 0.9631255  0.9380985
  0.91451139]
 [0.93543892 0.94352211 0.94528654 0.97245839 0.96935088 0.98066979
  1.         0.97107916 0.98289202 0.97365