In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import spacy
import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the citation export into a DataFrame.
# NOTE(review): file name and columns (GSRank, CitesURL, ...) suggest a
# Publish-or-Perish / Google Scholar export -- confirm the provenance.
data = pd.read_csv('./data/PoPCites2.csv')
#data = pd.read_csv('data/PoPCites2.csv',usecols = '')

# RQ1: Which technologies have been investigated in the last decade?

# RQ2: In which phase of the technology life cycle path are the technologies?
# - Interest in topic: number of papers on a topic over time
# - In parallel: interest in topic: number of citations to papers on a topic over time
# - Publication type: Workshop -> Conference -> Journal

# RQ3: How stable is the community working on the topics (new authors emerging, authors staying on for the whole time,
# or authors "leaving" the area)?

#data.info()
# Show the first 20 rows as the cell output.
data.head(20)

Unnamed: 0,Cites,Authors,Title,Year,Source,Publisher,ArticleURL,CitesURL,GSRank,QueryDate,...,Volume,Issue,StartPage,EndPage,ECC,CitesPerYear,CitesPerAuthor,AuthorCount,Age,Abstract
0,435,R Mall,Fundamentals of software engineering,2018,,books.google.com,https://books.google.com/books?hl=en&lr=&id=-J...,https://scholar.google.com/scholar?cites=22762...,1,2021-04-19 20:05:26,...,,,,,435,145.0,435,1,3,"This new edition of the book, is restructured ..."
1,1226,"M Brambilla, J Cabot, M Wimmer",Model-driven software engineering in practice,2017,… on software engineering,morganclaypool.com,https://www.morganclaypool.com/doi/abs/10.2200...,https://scholar.google.com/scholar?cites=47960...,2,2021-04-19 20:05:26,...,,,,,1226,306.5,409,3,4,This book discusses how model-based approaches...
2,118,"C Page, HW Six",Software Engineering,2017,Larsen and Keller Education,repo.mimitlibrary.ac.in,http://repo.mimitlibrary.ac.in:8080/jspui/bits...,https://scholar.google.com/scholar?cites=15367...,3,2021-04-19 20:05:26,...,,,,,118,29.5,59,2,4,1. a) What are drawbacks of Spiral Model? b) W...
3,22,G Booch,The history of software engineering,2018,IEEE Software,ieeexplore.ieee.org,https://ieeexplore.ieee.org/abstract/document/...,https://scholar.google.com/scholar?cites=16004...,4,2021-04-19 20:05:26,...,,,,,22,7.33,22,1,3,"Grady Booch, one of UML's original authors, of..."
4,142,"B Kitchenham, L Madeyski, D Budgen, J Keung…",Robust statistical methods for empirical softw...,2017,… Software Engineering,Springer,https://link.springer.com/article/10.1007/s106...,https://scholar.google.com/scholar?cites=16050...,5,2021-04-19 20:05:26,...,,,,,142,35.5,28,5,4,There have been many changes in statistical th...
5,179,"S Amershi, A Begel, C Bird, R DeLine…",Software engineering for machine learning: A c...,2019,… Engineering …,ieeexplore.ieee.org,https://ieeexplore.ieee.org/abstract/document/...,https://scholar.google.com/scholar?cites=11063...,6,2021-04-19 20:05:26,...,,,,,179,89.5,36,5,2,Recent advances in machine learning have stimu...
6,59,RJ Leach,Introduction to software engineering,2018,,books.google.com,https://books.google.com/books?hl=en&lr=&id=qk...,https://scholar.google.com/scholar?cites=11520...,7,2021-04-19 20:05:26,...,,,,,59,19.67,59,1,3,Practical Guidance on the Efficient Developmen...
7,48,"KJ Stol, B Fitzgerald",The ABC of software engineering research,2018,ACM Transactions on Software Engineering and …,dl.acm.org,https://dl.acm.org/doi/abs/10.1145/3241743,https://scholar.google.com/scholar?cites=81193...,8,2021-04-19 20:05:26,...,,,,,48,16.0,24,2,3,A variety of research methods and techniques a...
8,59,"A Arpteg, B Brinne, L Crnkovic-Friis…",Software engineering challenges of deep learning,2018,… Software Engineering …,ieeexplore.ieee.org,https://ieeexplore.ieee.org/abstract/document/...,https://scholar.google.com/scholar?cites=34964...,9,2021-04-19 20:05:26,...,,,,,59,19.67,15,4,3,Surprisingly promising results have been achie...
9,254,"K Mao, L Capra, M Harman, Y Jia",A survey of the use of crowdsourcing in softwa...,2017,Journal of Systems and Software,Elsevier,https://www.sciencedirect.com/science/article/...,https://scholar.google.com/scholar?cites=49112...,10,2021-04-19 20:05:26,...,,,,,254,63.5,64,4,4,The term 'crowdsourcing'was initially introduc...


In [3]:
# Functions
def Count(data, count):
    """Return the ``count`` most frequent items of ``data``.

    Args:
        data: Iterable of hashable items to tally.
        count: Number of ``(item, frequency)`` pairs to return; handed
            straight to ``Counter.most_common``.

    Returns:
        List of ``(item, frequency)`` tuples, most frequent first.
    """
    # A generator is passed so that every element is tallied individually,
    # exactly like an explicit loop over ``data``.
    tally = Counter(item for item in data)
    return tally.most_common(count)

def CountCites(data, count):
    """Tally the values in ``data`` and return the ``count`` most common.

    Args:
        data: Iterable of hashable values (e.g. citation counts).
        count: Number of ``(value, frequency)`` pairs to return.

    Returns:
        List of ``(value, frequency)`` tuples ordered by descending frequency.
    """
    frequencies = Counter()
    # iter() guarantees element-wise counting even for mapping inputs.
    frequencies.update(iter(data))
    return frequencies.most_common(count)

def CountWords(data, count, stop_words=None):
    """Count the most frequent non-stopword tokens in a collection of texts.

    Args:
        data: Object exposing ``.items()`` (e.g. a ``pandas.Series`` or a
            dict) whose values are the texts to tokenize.
        count: Number of ``(word, frequency)`` pairs to return.
        stop_words: Optional iterable of words to ignore. When omitted, the
            module-level ``stopwords`` list is used.

    Returns:
        List of ``(word, frequency)`` tuples, most frequent first.
    """
    if stop_words is None:
        # Looked up at call time, so the list may be defined in a later cell.
        stop_words = stopwords
    # Texts are lower-cased below, so the stopwords are lower-cased as well;
    # a set gives constant-time membership tests.
    ignored = {str(word).lower() for word in stop_words}

    cnt = Counter()
    for _, value in data.items():
        # Missing entries (None, or NaN for empty CSV cells) are not strings
        # and have no .lower(); skip them instead of raising.
        if not isinstance(value, str):
            continue
        # split() without an argument collapses runs of whitespace, so no
        # empty '' tokens end up in the tally.
        cnt.update(word for word in value.lower().split() if word not in ignored)
    return cnt.most_common(count)

def CountToTfidf(counts):
    """Re-weight a term-count matrix with TF-IDF and return it as an array.

    Args:
        counts: Term-count matrix accepted by
            ``TfidfTransformer.fit_transform``.

    Returns:
        The weighted matrix converted via ``.toarray()``.
    """
    # Fit and weight in one step (idf smoothing disabled), then densify.
    weighted = TfidfTransformer(smooth_idf=False).fit_transform(counts)
    return weighted.toarray()

def ListToDataFrame(l):
    """Turn a list of ``(word, value)`` pairs into a DataFrame indexed by word.

    Args:
        l: List of 2-tuples, e.g. the result of ``Counter.most_common``.

    Returns:
        ``pandas.DataFrame`` with index ``'word'`` and a single column
        ``'data'``. An empty input yields an empty frame of the same layout.
    """
    if len(l) == 0:
        # An empty input produces a frame without columns 0/1, so the rename
        # is a no-op and set_index('word') would raise KeyError.
        return pd.DataFrame(columns=['word', 'data']).set_index('word')
    ld = pd.DataFrame.from_dict(l)
    # Positional columns 0/1 hold the tuple elements.
    ld = ld.rename(columns={0: 'word', 1: 'data'})
    return ld.set_index('word')


def ListToDataArray(l, year):
    """Turn a list of ``(word, value)`` pairs into a per-year DataFrame.

    Args:
        l: List of 2-tuples, e.g. the result of ``Counter.most_common``.
        year: Label used as the name of the value column.

    Returns:
        ``pandas.DataFrame`` with index ``'word'`` and one column named
        ``year``. An empty input yields an empty frame of the same layout.
    """
    if len(l) == 0:
        # Without rows there is no column 0 to rename, so set_index('word')
        # would raise KeyError (e.g. for a year without any publication).
        return pd.DataFrame(columns=['word', year]).set_index('word')
    ld = pd.DataFrame.from_dict(l)
    # Positional columns 0/1 hold the tuple elements.
    ld = ld.rename(columns={0: 'word', 1: year})
    return ld.set_index('word')


def TfidfVector(data):
    """Fit a ``TfidfVectorizer`` on ``data`` and print vocabulary and matrix.

    Args:
        data: Iterable of raw text documents.

    Returns:
        None. The feature names and the dense TF-IDF matrix are printed.
    """
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(data)
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; fall back to it only on releases lacking the replacement.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out().tolist()
    else:
        feature_names = vectorizer.get_feature_names()
    print(feature_names)
    print(X.toarray())
    
def CountVector(data):
    """Fit a ``CountVectorizer`` on ``data`` and return the dense count matrix.

    Args:
        data: Iterable of raw text documents.

    Returns:
        Dense array of term counts (``.toarray()`` of the fitted matrix).
        The feature names are printed as a side effect.
    """
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(data)
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; fall back to it only on releases lacking the replacement.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out().tolist()
    else:
        feature_names = vectorizer.get_feature_names()
    print(feature_names)
    return X.toarray()

def FilteredByYear(data, year):
    """Return the rows of ``data`` whose ``'Year'`` column equals ``year``.

    Args:
        data: DataFrame with a ``'Year'`` column.
        year: Value to match against ``'Year'``.

    Returns:
        The matching rows, original index preserved.
    """
    is_selected = data['Year'].eq(year)
    return data.loc[is_selected]
    
def TitleFilteredByYear(data, year):
    """Return the ``'Title'`` values of the rows whose ``'Year'`` equals ``year``.

    Args:
        data: DataFrame with ``'Title'`` and ``'Year'`` columns.
        year: Value to match against ``'Year'``.

    Returns:
        Series of titles, original index preserved.
    """
    # Row mask and column label in a single .loc lookup.
    return data.loc[data['Year'] == year, 'Title']

def CountWordsByYear(data, year):
    """Return the 30 most frequent title words of the rows for ``year``.

    Args:
        data: DataFrame with ``'Year'`` and ``'Title'`` columns.
        year: Publication year to restrict the count to.

    Returns:
        List of ``(word, frequency)`` tuples as produced by ``CountWords``.
    """
    # FilteredByYear yields a DataFrame; only its 'Title' column is counted.
    titles_of_year = FilteredByYear(data, year)['Title']
    return CountWords(titles_of_year, 30)

def CountWordsInAbstractByYear(data, year):
    """Return the 30 most frequent abstract words of the rows for ``year``.

    Args:
        data: DataFrame with ``'Year'`` and ``'Abstract'`` columns.
        year: Publication year to restrict the count to.

    Returns:
        List of ``(word, frequency)`` tuples as produced by ``CountWords``.
    """
    abstracts_of_year = FilteredByYear(data, year)['Abstract']
    return CountWords(abstracts_of_year, 30)
    
def MergeDataFrames(data, anzahlJahre, endYear=None):
    """Inner-join the per-year title word counts of several consecutive years.

    For every year in ``range(endYear - anzahlJahre, endYear)`` the top title
    words are counted with ``CountWordsByYear`` and converted with
    ``ListToDataArray``; the yearly frames are then inner-joined on their
    ``'word'`` index, so only words present in every year remain.

    Args:
        data: DataFrame with ``'Year'`` and ``'Title'`` columns.
        anzahlJahre: Number of years to cover.
        endYear: Exclusive upper bound of the year range. Defaults to the
            current calendar year; pass it explicitly to get results that do
            not depend on the date the notebook is run.

    Returns:
        DataFrame indexed by ``'word'`` with one column per year, or an empty
        DataFrame when the year range is empty.
    """
    if endYear is None:
        endYear = datetime.datetime.now().year

    # None marks "nothing merged yet". Testing ``.empty`` instead would let
    # the next year overwrite a join that legitimately became empty.
    merged = None
    for y in range(endYear - anzahlJahre, endYear):
        yearly = ListToDataArray(CountWordsByYear(data, y), y)
        if merged is None:
            merged = yearly
        else:
            merged = merged.merge(yearly, on='word', how='inner')
    return pd.DataFrame() if merged is None else merged
    
def CosineSimilarity(data):
    """Return the pairwise cosine similarity of the rows of ``data``.

    Args:
        data: Matrix-like input accepted by
            ``sklearn.metrics.pairwise.cosine_similarity``.

    Returns:
        The similarity matrix of ``data`` compared against itself.
    """
    similarity_matrix = cosine_similarity(data, data)
    return similarity_matrix

def CreateDocTermMatrix(data):
    """Build a document-term count matrix from an iterable of texts.

    Args:
        data: Iterable of raw text documents (e.g. a ``pandas.Series``).

    Returns:
        ``pandas.DataFrame`` with one row per document and one column per
        vocabulary term (English stop words removed), holding term counts.
        The sparse count matrix is printed as a side effect.
    """
    documents = list(data)

    count_vectorizer = CountVectorizer(stop_words='english')
    sparse_matrix = count_vectorizer.fit_transform(documents)
    # Debug output kept from the original implementation.
    print(sparse_matrix)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; fall back to it only on releases lacking the replacement.
    if hasattr(count_vectorizer, 'get_feature_names_out'):
        feature_names = count_vectorizer.get_feature_names_out()
    else:
        feature_names = count_vectorizer.get_feature_names()

    # toarray() yields a plain ndarray rather than the legacy np.matrix that
    # todense() returns; the resulting DataFrame holds the same values.
    return pd.DataFrame(sparse_matrix.toarray(), columns=feature_names)


In [4]:
# Create variables
# Handles on individual columns of the loaded frame.
title = data['Title']
publisher = data['Publisher']
year = data['Year']

# Terms of special interest.
# NOTE(review): in the visible cells this list is only referenced from
# commented-out code inside CountWords.
wordlist = [
    'DevOps',
    'engineering',
    'Software',
    'Engineering',
    'software'
    
]
# Words that CountWords leaves out of its tally.
# NOTE(review): CountWords lower-cases the text before the lookup, so the
# upper-case 'A' entry can never match.
stopwords =[
    'this',
    'is',
    'none',
    'in',
    'of',
    'A',
    'a',
    'and',
    'for',
    'with',
    'on',
    'the',
    'to',
    'an',
    'be',
    'as',
    'has',
    'are',
    'we',
    'been',
    'by',
    'many',
    'that',
    'at',
    '...',
    'more',
    'them',
    'have',
    'such'
]


In [5]:
# Earlier experiments with per-year title counts (disabled).
#l1 = CountWordsByYear(data,2017)
#l2 = CountWordsByYear(data,2018)
#l3 = CountWordsByYear(data,2019)
#l4 = CountWordsByYear(data,2020)
#l4 = ListToDataArray(l4,2020)
# Top words in the abstracts of the 2019 rows, as (word, frequency) pairs.
l = CountWordsInAbstractByYear(data,2019)
# NOTE(review): CalculateSimilarity is not defined in the visible cells.
#d = CalculateSimilarity(l)

In [6]:
# Top abstract words of the 2019 rows, converted to a DataFrame indexed by word.
l = CountWordsInAbstractByYear(data,2019)
df = ListToDataFrame(l)
#df
#df.plot.bar()

In [7]:
# Work in progress: continue here

# Document-term matrix over all abstracts (one row per document).
abstract = CreateDocTermMatrix(data['Abstract'])
print(abstract)
# Pairwise cosine similarity between the abstract rows.
print(CosineSimilarity(abstract))

  (0, 1023)	1
  (0, 485)	1
  (0, 186)	1
  (0, 1301)	1
  (0, 1558)	1
  (0, 70)	1
  (0, 876)	1
  (0, 43)	1
  (0, 1417)	2
  (0, 523)	2
  (0, 1533)	1
  (0, 770)	1
  (0, 888)	1
  (0, 526)	1
  (0, 1524)	2
  (0, 1147)	1
  (0, 1386)	1
  (1, 186)	1
  (1, 1417)	2
  (1, 523)	2
  (1, 1147)	1
  (1, 455)	1
  (1, 985)	3
  (1, 163)	1
  (1, 119)	1
  :	:
  (198, 188)	1
  (198, 1151)	1
  (198, 1496)	1
  (198, 83)	1
  (199, 1417)	1
  (199, 523)	1
  (199, 970)	1
  (199, 926)	2
  (199, 899)	2
  (199, 1532)	1
  (199, 1465)	1
  (199, 115)	1
  (199, 385)	1
  (199, 988)	1
  (199, 982)	1
  (199, 196)	1
  (199, 1477)	1
  (199, 450)	1
  (199, 127)	1
  (199, 1230)	1
  (199, 598)	1
  (199, 150)	1
  (199, 783)	1
  (199, 169)	1
  (199, 635)	1
     12  14  15  17  1951  1960s  1968  1970s  1989  1990  ...  workers  \
0     0   0   0   0     0      0     0      0     0     0  ...        0   
1     0   0   0   0     0      0     0      0     0     0  ...        0   
2     0   0   0   0     0      0     0      0     0    

In [8]:
# Document-term matrix over all titles.
# NOTE(review): this rebinds `title`, which an earlier cell set to the raw
# data['Title'] column; re-running that cell afterwards changes its meaning again.
title = CreateDocTermMatrix(data['Title'])
# Vocabulary terms (column labels) of the title matrix.
title.columns

  (0, 181)	1
  (0, 417)	1
  (0, 151)	1
  (1, 417)	1
  (1, 151)	1
  (1, 285)	1
  (1, 135)	1
  (1, 329)	1
  (2, 417)	1
  (2, 151)	1
  (3, 417)	1
  (3, 151)	1
  (3, 212)	1
  (4, 417)	1
  (4, 151)	1
  (4, 388)	1
  (4, 425)	1
  (4, 275)	1
  (4, 148)	1
  (5, 417)	1
  (5, 151)	1
  (5, 261)	1
  (5, 256)	1
  (5, 66)	1
  (5, 434)	1
  :	:
  (196, 108)	1
  (196, 121)	1
  (196, 228)	1
  (196, 37)	1
  (197, 417)	1
  (197, 151)	1
  (197, 256)	1
  (197, 101)	1
  (197, 51)	1
  (197, 165)	1
  (197, 238)	1
  (198, 417)	1
  (198, 151)	1
  (198, 101)	1
  (198, 175)	1
  (198, 199)	1
  (198, 258)	1
  (198, 179)	1
  (199, 417)	1
  (199, 151)	1
  (199, 275)	1
  (199, 261)	1
  (199, 256)	1
  (199, 287)	1
  (199, 33)	1


Index(['10', '1000', '101', '12', '29110', '50', '88', 'abc', 'abstractions',
       'academia',
       ...
       'vs', 'vulnerabilities', 'waterfall', 'ways', 'word', 'wordnet', 'work',
       'writing', 'wrong', 'years'],
      dtype='object', length=512)

In [9]:
#data.info()

# Load the spaCy pipeline shipped as the en_core_web_md package.
import en_core_web_md
nlp = en_core_web_md.load()
#nlp = spacy.load("en_core_web_md")

doc = nlp("Tim and Tom")
# First and third token of the processed text.
token1, token2 = doc[0], doc[2]

# Compute the similarity between the two selected tokens
similarity = token1.similarity(token2)
print(similarity)

0.6667456


In [10]:
# Load English tokenizer, tagger, parser and NER
#nlp = spacy.load("en_core_web_sm")

# Process whole documents
#doc = nlp(text)

# Analyze syntax
#print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])
#print("Verbs:", [token.lemma_ for token in doc if token.pos_ == "VERB"])

# Find named entities, phrases and concepts
#for entity in doc.ents:
##print(entity.text, entity.label_)

In [11]:

# Document similarity x graph analysis

# Visualize growth


In [12]:
#data2018 = TitleFilteredByYear(data,2018)
#data['Cites'].plot.bar()

In [13]:
#TfidfVector(data)


In [14]:
#data2018