# RQ1: Which technologies have been investigated in the last decade?

## Module importieren

In [1]:
import pandas as pd
import altair as alt
import numpy as np
from collections import Counter
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from spacy.symbols import NOUN,X,VERB,ADJ,SYM,NUM,ADV,ADP,CCONJ
from spacy.matcher import Matcher
from spacy.util import filter_spans

## Daten einlesen
In diesem Block werden die Daten in einen Pandas-DataFrame geladen. Anschließend werden die Spalten zur besseren Übersichtlichkeit umbenannt.

In [2]:
# Load the publication data; only the columns needed for the analysis are read.
# The columns are renamed through an explicit mapping: pandas ignores the order
# of `usecols` and keeps the file's column order, so a positional rename would
# silently mislabel columns if the CSV layout ever changed.
data = (
    pd.read_csv(
        './data/data.csv',
        usecols=['Unnamed: 0', 'Document Title', 'Abstract', 'Publication Year'],
    )
    .rename(columns={
        'Unnamed: 0': 'index',
        'Document Title': 'title',
        'Publication Year': 'year',
        'Abstract': 'abstract',
    })
    # Use the original row number as the index and remove it from the columns.
    .set_index('index')
)


## NLP Model laden

In [3]:
# Load the spaCy pipeline "en_core_web_sm"; the cells below use it for
# tokenization, part-of-speech tags and lemmas.
nlp = spacy.load("en_core_web_sm")

## Matcher initialisieren

In [4]:
# Rule-based token matcher built on the vocabulary of the loaded pipeline.
matcher = Matcher(nlp.vocab)

## Pattern festlegen und dem Matcher hinzufügen

In [5]:
# A noun with an optional proper noun directly before and/or after it.
pattern_noun_propn = [
    dict(POS='PROPN', OP='?'),
    dict(POS='NOUN'),
    dict(POS='PROPN', OP='?'),
]

In [6]:
# Two consecutive nouns.
pattern_nouns = [
    dict(POS='NOUN'),
    dict(POS='NOUN'),
]

In [7]:
# Two consecutive proper nouns.
pattern_propn = [
    dict(POS='PROPN'),
    dict(POS='PROPN'),
]

In [8]:
# An adjective immediately followed by a noun.
pattern_adj_noun = [
    dict(POS='ADJ'),
    dict(POS='NOUN'),
]

In [9]:
# Two consecutive verbs.
pattern_verb_verb = [
    dict(POS='VERB'),
    dict(POS='VERB'),
]

In [10]:
# An adverb immediately followed by a noun.
pattern_adv_noun = [
    dict(POS='ADV'),
    dict(POS='NOUN'),
]

In [11]:
# A noun with an optional verb directly before and/or after it.
pattern_verb_noun = [
    dict(POS='VERB', OP='?'),
    dict(POS='NOUN'),
    dict(POS='VERB', OP='?'),
]

In [12]:
# Register each pattern under its own rule name, in the same order as before.
for rule_name, rule_pattern in (
    ("match_propn", pattern_propn),
    ("match_noun_propn", pattern_noun_propn),
    ("match_nouns", pattern_nouns),
    ("match_adj_noun", pattern_adj_noun),
    ("match_verb_verb", pattern_verb_verb),
    ("match_adv_noun", pattern_adv_noun),
    ("match_verb_noun", pattern_verb_noun),
):
    matcher.add(rule_name, [rule_pattern])

## Funktion für die TermDokumentenMatrix

In [13]:
def CreateTermDocMatrix(column):
    """Build a term-by-year count matrix from one text column of `data`.

    Every entry of ``data[column]`` is run through `my_preprocessor` and
    tokenized with `Retokenize`; the per-document term counts are then summed
    per publication year.

    Parameters
    ----------
    column : str
        Name of the text column in the module-level DataFrame `data`.

    Returns
    -------
    pandas.DataFrame
        One row per term, one column per year (column axis named ``'year'``),
        int32 counts.
    """
    count_vectorizer = CountVectorizer(
        preprocessor=my_preprocessor, tokenizer=Retokenize, dtype=np.int32
    )
    sparse_matrix = count_vectorizer.fit_transform(data[column].values.astype('U'))

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on old installations.
    if hasattr(count_vectorizer, 'get_feature_names_out'):
        terms = count_vectorizer.get_feature_names_out()
    else:
        terms = count_vectorizer.get_feature_names()

    # toarray() yields a plain ndarray instead of the legacy np.matrix.
    df = pd.DataFrame(sparse_matrix.toarray(), columns=terms, dtype=np.int32)

    # Group by a separate, positionally aligned Series instead of adding a
    # 'year' column: a column would overwrite the counts of a vocabulary term
    # that happens to be called "year", and label-based alignment would be
    # wrong whenever the index of `data` is not 0..n-1 in row order.
    years = pd.Series(data['year'].to_numpy(), name='year')
    return df.groupby(years).sum().transpose().astype('int32')

In [14]:
def ShowWord(data, word):
    """Draw a bar chart of the row labelled `word` in `data`."""
    data.loc[word].plot.bar()

In [15]:
def Retokenize(text):
    """Tokenize `text` and merge matched multi-word spans into single tokens.

    The text is parsed with `nlp`, all `matcher` hits are reduced with
    `filter_spans`, and each remaining span is merged into one token whose
    POS is set to NOUN. Returns the list of token texts after merging.
    """
    doc = nlp(text)

    candidate_spans = [doc[start:end] for _, start, end in matcher(doc)]
    merged_spans = filter_spans(candidate_spans)

    with doc.retokenize() as retokenizer:
        for span in merged_spans:
            retokenizer.merge(span, attrs={"POS": "NOUN"})

    return [token.text for token in doc]

In [16]:
def my_postprocessor(token):
    """Return `token` if it passes the filters, otherwise ``None``.

    A token is rejected when it is a digit, punctuation, number-like, a stop
    word or whitespace, or when its part of speech is X, VERB, ADJ, SYM or
    NUM.

    The previous version returned the undefined name ``result`` for rejected
    tokens and therefore raised ``NameError``; rejected tokens now yield
    ``None``.
    """
    if (token.is_digit or token.is_punct or token.like_num
            or token.is_stop or token.is_space):
        return None
    if token.pos in (X, VERB, ADJ, SYM, NUM):
        return None
    return token
        

In [17]:
def my_preprocessor(text):
    """Reduce `text` to a blank-separated string of lemmas of the kept tokens.

    `text` is converted with ``str`` and parsed with `nlp`. A token is kept
    unless it is a digit, punctuation, number-like, a stop word or
    whitespace, or its part of speech is one of X, VERB, ADJ, ADV, ADP, SYM,
    NUM or CCONJ. Returns the concatenated lemmas, or ``''`` when nothing is
    kept.
    """
    dropped_pos = (X, VERB, ADJ, ADV, ADP, SYM, NUM, CCONJ)
    result = ''

    for token in nlp(str(text)):
        flagged = (token.is_digit or token.is_punct or token.like_num
                   or token.is_stop or token.is_space)
        if not flagged and token.pos not in dropped_pos:
            result += token.lemma_
        # After every token, make sure a non-empty result ends with a blank.
        if result and not result.endswith(' '):
            result += ' '

    return result

## Term Dokumenten Matrix erstellen

In [18]:
# Term-by-year count matrix built from the 'abstract' column.
abstract = CreateTermDocMatrix('abstract')

In [19]:
# TF-IDF transformer created with its default settings.
tfidf =TfidfTransformer()

In [20]:
# Fit on the transposed matrix, so each year is one row. The transformed
# matrix is only displayed here, not stored.
tfidf.fit_transform(abstract.transpose())

<13x38601 sparse matrix of type '<class 'numpy.float64'>'
	with 48138 stored elements in Compressed Sparse Row format>

In [21]:
# Show the per-term inverse document frequencies learned by the fit.
tfidf.idf_

array([2.54044504, 2.94591015, 2.94591015, ..., 2.94591015, 2.94591015,
       2.94591015])

## Top-Themen pro Jahr

In [22]:
# Terms that are removed from the matrix before the top terms are ranked.
stopwords = [
    'software engineering',
    'system',
    'software development',
    'result',
    'software',
    'despite',
    'one',
    'Software Engineering',
    'metric',
    'up',
]

In [23]:
# Number of top terms that are listed and plotted per year.
anzahl_top_topics = 15

In [24]:
# Remove the manually curated stop terms (rows) from the term-by-year matrix.
# errors='ignore' keeps this cell runnable when a listed term does not occur
# in the vocabulary; without it, drop() raises KeyError.
data_words = abstract.drop(stopwords, axis=0, errors='ignore')

In [25]:
# Print the most frequent terms of every year from 2009 through 2021,
# each followed by a separator line.
separator = '--------------------------------------------'
for year in range(2009, 2022):
    d = data_words[year].nlargest(anzahl_top_topics)
    print(d, separator, sep='\n')
    



   


software system          12
case study                9
software architecture     7
OO                        6
code review               5
source software           5
IEEE Software             4
backend process           4
coverage datum            4
development practice      4
end user                  4
engineering               4
language engineering      4
metamodel                 4
metric change             4
Name: 2009, dtype: int32
--------------------------------------------
product line             10
software system          10
software product          9
software architecture     8
exception handling        7
case study                5
process modeling          5
software quality          5
test case                 5
bug report                4
development               4
program behavior          4
program input             4
security mechanism        4
test suite                4
Name: 2010, dtype: int32
--------------------------------------------
software system     

## Balkendiagramme erstellen

### Balkendiagramm 2009

In [26]:
# Bar chart of the most frequent terms in the selected year.
year = 2009
# nlargest() already returns the values in descending order.
d = data_words[year].nlargest(anzahl_top_topics)
source = pd.DataFrame({'Technologies': d.index,
                       'Anzahl': d})
alt.Chart(source).mark_bar().encode(
    # sort='-y' orders the bars by descending count. Without it Altair sorts
    # the ordinal axis alphabetically, whatever the row order of `source` is.
    x=alt.X('Technologies:O', sort='-y'),
    y='Anzahl:Q'
).properties(
    title=f'Technologien {year}')

### Balkendiagramm 2010

In [27]:
# Bar chart of the most frequent terms in the selected year.
year = 2010
# nlargest() already returns the values in descending order.
d = data_words[year].nlargest(anzahl_top_topics)
source = pd.DataFrame({'Technologies': d.index,
                       'Anzahl': d})
alt.Chart(source).mark_bar().encode(
    # sort='-y' orders the bars by descending count. Without it Altair sorts
    # the ordinal axis alphabetically, whatever the row order of `source` is.
    x=alt.X('Technologies:O', sort='-y'),
    y='Anzahl:Q'
).properties(
    title=f'Technologien {year}')

### Balkendiagramm 2011

In [28]:
# Bar chart of the most frequent terms in the selected year.
year = 2011
# nlargest() already returns the values in descending order.
d = data_words[year].nlargest(anzahl_top_topics)
source = pd.DataFrame({'Technologies': d.index,
                       'Anzahl': d})
alt.Chart(source).mark_bar().encode(
    # sort='-y' orders the bars by descending count. Without it Altair sorts
    # the ordinal axis alphabetically, whatever the row order of `source` is.
    x=alt.X('Technologies:O', sort='-y'),
    y='Anzahl:Q'
).properties(
    title=f'Technologien {year}')

### Balkendiagramm 2012

In [29]:
# Bar chart of the most frequent terms in the selected year.
year = 2012
# nlargest() already returns the values in descending order.
d = data_words[year].nlargest(anzahl_top_topics)
source = pd.DataFrame({'Technologies': d.index,
                       'Anzahl': d})
alt.Chart(source).mark_bar().encode(
    # sort='-y' orders the bars by descending count. Without it Altair sorts
    # the ordinal axis alphabetically, whatever the row order of `source` is.
    x=alt.X('Technologies:O', sort='-y'),
    y='Anzahl:Q'
).properties(
    title=f'Technologien {year}')

### Balkendiagramm 2013

In [30]:
# Bar chart of the most frequent terms in the selected year.
year = 2013
# nlargest() already returns the values in descending order.
d = data_words[year].nlargest(anzahl_top_topics)
source = pd.DataFrame({'Technologies': d.index,
                       'Anzahl': d})
alt.Chart(source).mark_bar().encode(
    # sort='-y' orders the bars by descending count. Without it Altair sorts
    # the ordinal axis alphabetically, whatever the row order of `source` is.
    x=alt.X('Technologies:O', sort='-y'),
    y='Anzahl:Q'
).properties(
    title=f'Technologien {year}')

### Balkendiagramm 2014

In [31]:
# Bar chart of the most frequent terms in the selected year.
year = 2014
# nlargest() already returns the values in descending order.
d = data_words[year].nlargest(anzahl_top_topics)
source = pd.DataFrame({'Technologies': d.index,
                       'Anzahl': d})
alt.Chart(source).mark_bar().encode(
    # sort='-y' orders the bars by descending count. Without it Altair sorts
    # the ordinal axis alphabetically, whatever the row order of `source` is.
    x=alt.X('Technologies:O', sort='-y'),
    y='Anzahl:Q'
).properties(
    title=f'Technologien {year}')

### Balkendiagramm 2015

In [32]:
# Bar chart of the most frequent terms in the selected year.
year = 2015
# nlargest() already returns the values in descending order.
d = data_words[year].nlargest(anzahl_top_topics)
source = pd.DataFrame({'Technologies': d.index,
                       'Anzahl': d})
alt.Chart(source).mark_bar().encode(
    # sort='-y' orders the bars by descending count. Without it Altair sorts
    # the ordinal axis alphabetically, whatever the row order of `source` is.
    x=alt.X('Technologies:O', sort='-y'),
    y='Anzahl:Q'
).properties(
    title=f'Technologien {year}')

### Balkendiagramm 2016

In [33]:
# Bar chart of the most frequent terms in the selected year.
year = 2016
# nlargest() already returns the values in descending order.
d = data_words[year].nlargest(anzahl_top_topics)
source = pd.DataFrame({'Technologies': d.index,
                       'Anzahl': d})
alt.Chart(source).mark_bar().encode(
    # sort='-y' orders the bars by descending count. Without it Altair sorts
    # the ordinal axis alphabetically, whatever the row order of `source` is.
    x=alt.X('Technologies:O', sort='-y'),
    y='Anzahl:Q'
).properties(
    title=f'Technologien {year}')

### Balkendiagramm 2017

In [34]:
# Bar chart of the most frequent terms in the selected year.
year = 2017
# nlargest() already returns the values in descending order.
d = data_words[year].nlargest(anzahl_top_topics)
source = pd.DataFrame({'Technologies': d.index,
                       'Anzahl': d})
alt.Chart(source).mark_bar().encode(
    # sort='-y' orders the bars by descending count. Without it Altair sorts
    # the ordinal axis alphabetically, whatever the row order of `source` is.
    x=alt.X('Technologies:O', sort='-y'),
    y='Anzahl:Q'
).properties(
    title=f'Technologien {year}')

### Balkendiagramm 2018

In [35]:
# Bar chart of the most frequent terms in the selected year.
year = 2018
# nlargest() already returns the values in descending order.
d = data_words[year].nlargest(anzahl_top_topics)
source = pd.DataFrame({'Technologies': d.index,
                       'Anzahl': d})
alt.Chart(source).mark_bar().encode(
    # sort='-y' orders the bars by descending count. Without it Altair sorts
    # the ordinal axis alphabetically, whatever the row order of `source` is.
    x=alt.X('Technologies:O', sort='-y'),
    y='Anzahl:Q'
).properties(
    title=f'Technologien {year}')

### Balkendiagramm 2019

In [36]:
# Bar chart of the most frequent terms in the selected year.
year = 2019
# nlargest() already returns the values in descending order.
d = data_words[year].nlargest(anzahl_top_topics)
source = pd.DataFrame({'Technologies': d.index,
                       'Anzahl': d})
alt.Chart(source).mark_bar().encode(
    # sort='-y' orders the bars by descending count. Without it Altair sorts
    # the ordinal axis alphabetically, whatever the row order of `source` is.
    x=alt.X('Technologies:O', sort='-y'),
    y='Anzahl:Q'
).properties(
    title=f'Technologien {year}')

### Balkendiagramm 2020

In [37]:
# Bar chart of the most frequent terms in the selected year.
year = 2020
# nlargest() already returns the values in descending order.
d = data_words[year].nlargest(anzahl_top_topics)
source = pd.DataFrame({'Technologies': d.index,
                       'Anzahl': d})
alt.Chart(source).mark_bar().encode(
    # sort='-y' orders the bars by descending count. Without it Altair sorts
    # the ordinal axis alphabetically, whatever the row order of `source` is.
    x=alt.X('Technologies:O', sort='-y'),
    y='Anzahl:Q'
).properties(
    title=f'Technologien {year}')

### Balkendiagramm 2021

In [38]:
# Bar chart of the most frequent terms in the selected year.
year = 2021
# nlargest() already returns the values in descending order.
d = data_words[year].nlargest(anzahl_top_topics)
source = pd.DataFrame({'Technologies': d.index,
                       'Anzahl': d})
alt.Chart(source).mark_bar().encode(
    # The bar order is controlled by the sort of the ordinal x channel;
    # sort='-y' orders the bars by descending count. The earlier sort='x' on
    # the quantitative y channel did not reorder the bars.
    x=alt.X('Technologies:O', sort='-y'),
    y='Anzahl:Q'
).properties(
    title=f'Technologien {year}')

In [39]:
#for columns in data.itertuples():
#    text = columns[3] #1 = title 2 = year 3 = abstract
    # Verarbeite den Text
#    if type(text) is str:
#            doc = nlp(text)
    
#    matches = matcher(doc)
#    sp  = [doc[match[1]:match[2]] for match in matches]
#    filtered = filter_spans(sp)
#    with doc.retokenize() as retokenizer:
#        for s in filtered:
#            retokenizer.merge(s, attrs = {"POS": "NOUN"})
#    [(token.pos_, token.text) for token in doc]

#    for token in doc:
        # Greife auf den Text, die Wortart und die Dependenzrelation des Tokens zu
#        token_text = token.text
#        token_pos = token.pos_
#        token_dep = token.dep_
#        token_ent = token.ent_type_

#        if token.pos == NOUN:
#            if not token.is_stop:
#                words[token_text] += 1
                                          

## Wörter darstellen

In [40]:
# Debug check: print the type of the column-wise sum of the matrix.
print(type(abstract.sum(axis = 0)))

<class 'pandas.core.series.Series'>


## Liste erstellen

In [41]:
# The 20 terms with the highest total count summed over all years.
words = data_words.sum(axis= 1).nlargest(20)

In [42]:
# Per-year counts restricted to the terms selected in `words`.
b = data_words[data_words.index.isin(words.index)] 

In [43]:
# Display the per-year counts of the selected terms.
b

year,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021
IEEE Software,4,1,4,4,15,9,1,5,2,3,1,1,2
Stack Overflow,0,0,0,0,0,0,0,2,5,0,6,10,17
approach,2,1,2,4,6,5,6,3,7,3,3,10,5
bug report,0,4,0,0,3,4,1,4,4,1,8,14,4
case study,9,5,6,9,5,4,3,4,7,7,7,17,8
code,3,0,3,3,0,3,1,3,2,0,9,5,5
defect prediction,0,0,1,0,5,1,0,5,5,4,17,14,1
paper approach,1,1,2,3,3,2,1,4,2,3,1,13,3
programming language,2,2,5,2,1,9,1,2,3,2,5,8,4
software architecture,7,8,4,2,5,0,0,5,2,3,2,0,2


In [44]:
#print(cosine_similarity(b))