# RQ1: Which technologies have been investigated in the last decade?

## Module importieren

In [1]:
import pandas as pd
import altair as alt
import numpy as np
from collections import Counter
import spacy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity
from spacy.symbols import NOUN,X,VERB,ADJ,SYM,NUM
from spacy.matcher import Matcher
from spacy.util import filter_spans

## Daten einlesen
In diesem Block werden die Daten in eine Pandas DataFrame geladen. Anschließend werden die Spalten zur besseren Übersichtlichkeit umbenannt.

In [2]:
#Datene einlesen
data = pd.read_csv('./data/data.csv',usecols = ['Unnamed: 0','Document Title','Abstract','Publication Year'])
data.columns = ['index','title','year','abstract']
data.index = data['index']

data.drop(["index"], axis = 1, inplace = True)


## NLP Model laden

In [3]:
nlp = spacy.load("en_core_web_sm")

## Matcher initialisieren

In [4]:
matcher = Matcher(nlp.vocab)

## Pattern festlegen und dem Matcher hinzufügen

In [5]:
pattern_noun_propn = [{'POS': 'PROPN', 'OP': '?'},
                      {'POS': 'NOUN'},
                      {'POS': 'PROPN', 'OP': '?'}]

In [6]:
pattern_nouns = [{'POS': 'NOUN'},
                 {'POS': 'NOUN'}]

In [7]:
pattern_propn = [{'POS': 'PROPN'},
                 {'POS': 'PROPN'}]

In [8]:
pattern_adj_noun = [{'POS': 'ADJ'},
                    {'POS': 'NOUN'}]

In [9]:
pattern_verb_verb = [{'POS': 'VERB'},
                    {'POS': 'VERB'}]

In [10]:
#pattern_test = [{'POS': 'ADJ', 'OP': '?'},
#                {'POS': 'NOUN'},
#                {'POS': 'NOUN'}]

pattern_exmaple = [{'POS': 'ADJ', 'OP': '?'},
                   {'OP': '+', 'POS': 'NOUN'},
                   {'POS': 'NOUN', 'OP': '?'}]

In [11]:
matcher.add("match_propn", [pattern_propn])
matcher.add("match_noun_propn", [pattern_noun_propn])
matcher.add("match_nouns", [pattern_nouns])
matcher.add("match_adj_noun", [pattern_adj_noun])
matcher.add("match_verb_verb", [pattern_verb_verb])

## Funktion für die TermDokumentenMatrix

In [12]:
def CreateTermDocMatrix(column):
        
    #stop_words='english',ngram_range = (1,2),preprocessor = my_preprocessor, 
    count_vectorizer = CountVectorizer(preprocessor = my_preprocessor,  tokenizer = Retokenize,dtype=np.int32)
    sparse_matrix = count_vectorizer.fit_transform(data[column].values.astype('U'))

    doc_term_matrix = sparse_matrix.todense()
    df = pd.DataFrame(doc_term_matrix, columns=count_vectorizer.get_feature_names(), dtype=np.int32)
    df['year'] = data['year']
    returndf = df.groupby('year').sum().transpose().astype('int32')
    return returndf

In [13]:
def ShowWord(data,word):
    d = data.loc[word]
    d.plot.bar()

In [14]:
def Retokenize(text):
    doc = nlp(text)

    matches = matcher(doc)
    sp  = [doc[match[1]:match[2]] for match in matches] 

    filtered = filter_spans(sp)
    with doc.retokenize() as retokenizer:
        for s in filtered:
            retokenizer.merge(s, attrs = {"POS": "NOUN"})
    return [(token.text,token.pos_) for token in doc]

In [15]:
def my_postprocessor(token):
    
    if not token.is_digit:
        if not token.is_punct:
            if not token.like_num:
                if not token.is_stop:
                    if not token.is_space:
                        if token.pos != X:
                            if token.pos != VERB:
                                if token.pos != ADJ:
                                    if token.pos != SYM:
                                        if token.pos != NUM:
                                            return token
    return result
        

In [16]:
def my_preprocessor(text):
    result = ''
    doc = nlp(str(text))
    
    for token in doc:
        if not token.is_digit:
            if not token.is_punct:
                if not token.like_num:
                    if not token.is_stop:
                        if not token.is_space:
                            if token.pos != X:
                                if token.pos != VERB:
                                    if token.pos != ADJ:
                                        if token.pos != SYM:
                                            if token.pos != NUM:
                                                result += token.lemma_
        if len(result) != 0:
            if result[-1] != ' ':
                result += ' '
    return result

## Term Dokumenten Matrix erstellen

In [17]:
abstract = CreateTermDocMatrix('abstract')

In [18]:
tfidf =TfidfTransformer()

In [19]:
tfidf.fit_transform(abstract.transpose())

<13x37992 sparse matrix of type '<class 'numpy.float64'>'
	with 50409 stored elements in Compressed Sparse Row format>

In [20]:
tfidf.idf_

array([2.54044504, 2.94591015, 2.94591015, ..., 2.94591015, 2.94591015,
       2.94591015])

## Top 5 Themen

In [21]:
stopwords = ['set', 'system','metric','result']

In [22]:
anzahl_top_topics = 20

In [23]:
data_words = abstract.drop(stopwords, axis = 0)
#data_words = abstract

In [24]:
for year in range(2009, 2022,1):
    d = data_words[year].nlargest(anzahl_top_topics)
    print(d)
    print('--------------------------------------------')
    



   


(software development, NOUN)    24
(software engineering, NOUN)    18
(result, VERB)                  14
(software system, NOUN)         14
(system, NOUN)                  11
(directly, ADV)                  9
(respectively, ADV)              8
(set, VERB)                      8
(IEEE Software, NOUN)            7
(case study, NOUN)               7
(defect, VERB)                   7
(engineering, NOUN)              7
(generally, ADV)                 7
(process, NOUN)                  7
(software, NOUN)                 7
(automatically, ADV)             6
(test suite, NOUN)               6
(ERP implementation, NOUN)       5
(article, NOUN)                  5
(carefully, ADV)                 5
Name: 2009, dtype: int32
--------------------------------------------
(set, VERB)                      23
(automatically, ADV)             16
(result, VERB)                   14
(product line, NOUN)             12
(software development, NOUN)     12
(software system, NOUN)          10
(software engi

## Balkendiagramme erstellen

### Balkendiagramm 2009

In [25]:
year = 2009
d = data_words[year].nlargest(anzahl_top_topics).sort_values(ascending=False)
source = pd.DataFrame({'Technologies': d.index,
                            'Anzahl': d})
alt.Chart(source).mark_bar().encode(
        x='Technologies:O',
        y='Anzahl:Q'
    ).properties(
    title=f'Technologien {year}')

### Balkendiagramm 2010

In [26]:
year = 2010
d = data_words[year].nlargest(anzahl_top_topics).sort_values(ascending=False)
source = pd.DataFrame({'Technologies': d.index,
                            'Anzahl': d})
alt.Chart(source).mark_bar().encode(
        x='Technologies:O',
        y='Anzahl:Q'
    ).properties(
    title=f'Technologien {year}')

## Balkendiagramm 2011

In [27]:
year = 2011
d = data_words[year].nlargest(anzahl_top_topics).sort_values(ascending=False)
source = pd.DataFrame({'Technologies': d.index,
                            'Anzahl': d})
alt.Chart(source).mark_bar().encode(
        x='Technologies:O',
        y='Anzahl:Q'
    ).properties(
    title=f'Technologien {year}')

### Balkendiagramm 2012

In [28]:
year = 2012
d = data_words[year].nlargest(anzahl_top_topics).sort_values(ascending=False)
source = pd.DataFrame({'Technologies': d.index,
                            'Anzahl': d})
alt.Chart(source).mark_bar().encode(
        x='Technologies:O',
        y='Anzahl:Q'
    ).properties(
    title=f'Technologien {year}')

### Balkendiagramm 2013

In [29]:
year = 2013
d = data_words[year].nlargest(anzahl_top_topics).sort_values(ascending=False)
source = pd.DataFrame({'Technologies': d.index,
                            'Anzahl': d})
alt.Chart(source).mark_bar().encode(
        x='Technologies:O',
        y='Anzahl:Q'
    ).properties(
    title=f'Technologien {year}')

### Balkendiagramm 2014

In [30]:
year = 2014
d = data_words[year].nlargest(anzahl_top_topics).sort_values(ascending=False)
source = pd.DataFrame({'Technologies': d.index,
                            'Anzahl': d})
alt.Chart(source).mark_bar().encode(
        x='Technologies:O',
        y='Anzahl:Q'
    ).properties(
    title=f'Technologien {year}')

### Balkendiagramm 2015

In [31]:
year = 2015
d = data_words[year].nlargest(anzahl_top_topics).sort_values(ascending=False)
source = pd.DataFrame({'Technologies': d.index,
                            'Anzahl': d})
alt.Chart(source).mark_bar().encode(
        x='Technologies:O',
        y='Anzahl:Q'
    ).properties(
    title=f'Technologien {year}')

### Balkendiagramm 2016

In [32]:
year = 2016
d = data_words[year].nlargest(anzahl_top_topics).sort_values(ascending=False)
source = pd.DataFrame({'Technologies': d.index,
                            'Anzahl': d})
alt.Chart(source).mark_bar().encode(
        x='Technologies:O',
        y='Anzahl:Q'
    ).properties(
    title=f'Technologien {year}')

### Balkendiagramm 2017

In [33]:
year = 2017
d = data_words[year].nlargest(anzahl_top_topics).sort_values(ascending=False)
source = pd.DataFrame({'Technologies': d.index,
                            'Anzahl': d})
alt.Chart(source).mark_bar().encode(
        x='Technologies:O',
        y='Anzahl:Q'
    ).properties(
    title=f'Technologien {year}')

### Balkendiagramm 2018

In [34]:
year = 2018
d = data_words[year].nlargest(anzahl_top_topics).sort_values(ascending=False)
source = pd.DataFrame({'Technologies': d.index,
                            'Anzahl': d})
alt.Chart(source).mark_bar().encode(
        x='Technologies:O',
        y='Anzahl:Q'
    ).properties(
    title=f'Technologien {year}')

### Balkendiagramm 2019

In [35]:
year = 2019
d = data_words[year].nlargest(anzahl_top_topics).sort_values(ascending=False)
source = pd.DataFrame({'Technologies': d.index,
                            'Anzahl': d})
alt.Chart(source).mark_bar().encode(
        x='Technologies:O',
        y='Anzahl:Q'
    ).properties(
    title=f'Technologien {year}')

### Balkendiagramm 2020

In [36]:
year = 2020
d = data_words[year].nlargest(anzahl_top_topics).sort_values(ascending=False)
source = pd.DataFrame({'Technologies': d.index,
                            'Anzahl': d})
alt.Chart(source).mark_bar().encode(
        x='Technologies:O',
        y='Anzahl:Q'
    ).properties(
    title=f'Technologien {year}')

### Balkendiagramm 2021

In [37]:
year = 2021
d = data_words[year].nlargest(anzahl_top_topics).sort_values(ascending=False)
source = pd.DataFrame({'Technologies': d.index,
                            'Anzahl': d})
alt.Chart(source).mark_bar().encode(
        #x='Technologies:O',
        alt.X('Technologies:O'),
        alt.Y('Anzahl:Q', sort='x')
        #y='Anzahl:Q'
    ).properties(
    title=f'Technologien {year}')

In [38]:
#for columns in data.itertuples():
#    text = columns[3] #1 = title 2 = year 3 = abstract
    # Verarbeite den Text
#    if type(text) is str:
#            doc = nlp(text)
    
#    matches = matcher(doc)
#    sp  = [doc[match[1]:match[2]] for match in matches]
#    filtered = filter_spans(sp)
#    with doc.retokenize() as retokenizer:
#        for s in filtered:
#            retokenizer.merge(s, attrs = {"POS": "NOUN"})
#    [(token.pos_, token.text) for token in doc]

#    for token in doc:
        # Greife auf den Text, die Wortart und die Dependenzrelation des Tokens zu
#        token_text = token.text
#        token_pos = token.pos_
#        token_dep = token.dep_
#        token_ent = token.ent_type_

#        if token.pos == NOUN:
#            if not token.is_stop:
#                words[token_text] += 1
                                          

## Wörter darstellen

In [39]:
print(type(abstract.sum(axis = 0)))

<class 'pandas.core.series.Series'>


## Liste erstellen

In [40]:
#words = abstract.sum(axis= 1).nlargest(20)

In [41]:
#b = abstract[abstract.index.isin(words.index)] 

In [42]:
#b

In [43]:
#print(cosine_similarity(b))