In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import spacy
import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the citation export that the rest of the notebook works on.
# Later cells read its Title, Publisher, Year and Abstract columns.
data = pd.read_csv('./data/PoPCites2.csv')
#data = pd.read_csv('data/PoPCites2.csv',usecols = '')

# RQ1: Which technologies have been investigated in the last decade?

# RQ2: In which phase of the technology life cycle path are the technologies?
# - Interest in topic: number of papers on a topic over time
# - In parallel: interest in topic measured by the number of citations to papers on a topic over time
# - Publication type: Workshop -> Conference -> Journal

# RQ3: How stable is the community working on the topics (new authors emerging, authors staying on for the whole time,
# or authors “leaving” the area)?

#data.info()
#data.head(20)

In [3]:
# Functions
def Count(data,count):
    """Return the ``count`` most frequent items of ``data``.

    Args:
        data: Iterable of hashable items.
        count: Maximum number of entries to return; ``None`` returns all.

    Returns:
        List of ``(item, frequency)`` tuples, most frequent first.
    """
    # iter() makes Counter treat every input as a plain sequence of items,
    # so a mapping is counted by its keys exactly like any other iterable.
    frequencies = Counter(iter(data))
    return frequencies.most_common(count)

def CountCites(data,count):
    """Tally how often each value occurs in ``data`` and return the top ``count``.

    Args:
        data: Iterable of hashable values.
        count: Number of most common values to return; ``None`` returns all.

    Returns:
        List of ``(value, occurrences)`` tuples ordered by descending occurrences.
    """
    tally = Counter()
    # Feed an iterator so that every element is counted once per occurrence.
    tally.update(iter(data))
    top_entries = tally.most_common(count)
    return top_entries

def CountWords(data,count,stop_words=None):
    """Count the most frequent non-stopword tokens across a collection of texts.

    Each text is lower-cased and split on whitespace. Entries that are not
    strings (for example NaN for a missing title or abstract) are skipped
    instead of raising ``AttributeError``.

    Args:
        data: Object exposing ``.items()`` that yields ``(key, text)`` pairs,
            e.g. a pandas Series of titles.
        count: Number of top tokens to return; ``None`` returns all.
        stop_words: Optional iterable of words to ignore. Defaults to the
            notebook-level ``stopwords`` list.

    Returns:
        List of ``(token, frequency)`` tuples, most frequent first.
    """
    if stop_words is None:
        # Fall back to the notebook-level list, looked up at call time.
        stop_words = stopwords
    # Tokens are lower-cased below, so the stopwords must be lower-cased as
    # well; a set gives constant-time membership tests.
    ignored = {word.lower() for word in stop_words}

    cnt = Counter()
    for _, value in data.items():
        if not isinstance(value, str):
            continue
        # split() without an argument collapses runs of whitespace, so no
        # empty-string tokens are counted.
        cnt.update(word for word in value.lower().split() if word not in ignored)
    return cnt.most_common(count)

def CountToTfidf(counts):
    """Re-weight a count matrix with TF-IDF and return it as a dense array.

    Args:
        counts: Count matrix accepted by ``TfidfTransformer.fit_transform``.

    Returns:
        Dense array holding the TF-IDF weighted values (idf smoothing disabled).
    """
    weighted = TfidfTransformer(smooth_idf=False).fit_transform(counts)
    return weighted.toarray()

def ListToDataFrame(l):
    """Turn a list of ``(word, value)`` pairs into a DataFrame indexed by word.

    Args:
        l: Sequence of 2-tuples such as the output of ``Counter.most_common``.
            An empty sequence is allowed.

    Returns:
        pandas.DataFrame with index ``'word'`` and a single column ``'data'``.
    """
    # Naming the columns up front (instead of renaming 0/1 afterwards) keeps
    # them present for an empty list, so set_index() does not raise KeyError.
    frame = pd.DataFrame(l, columns=['word', 'data'])
    return frame.set_index('word')


def ListToDataArray(l,year):
    """Turn a list of ``(word, value)`` pairs into a DataFrame with a per-year column.

    Despite the name, the result is a pandas DataFrame.

    Args:
        l: Sequence of 2-tuples such as the output of ``Counter.most_common``.
            An empty sequence is allowed.
        year: Label used for the value column.

    Returns:
        pandas.DataFrame with index ``'word'`` and a single column named ``year``.
    """
    # Naming the columns up front (instead of renaming 0/1 afterwards) keeps
    # them present for an empty list, so set_index() does not raise KeyError.
    frame = pd.DataFrame(l, columns=['word', year])
    return frame.set_index('word')


def TfidfVector(data):
    """Fit a TF-IDF vectorizer on ``data`` and print its vocabulary and matrix.

    Args:
        data: Iterable of raw text documents.

    Returns:
        None. The feature names and the dense TF-IDF matrix are printed.
    """
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(data)
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back to the old name on older installations.
    get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
    # list() keeps the printed form identical regardless of which API answered.
    print(list(get_names()))
    print(X.toarray())
    
def CountVector(data):
    """Fit a count vectorizer on ``data``, print its vocabulary, return the counts.

    Args:
        data: Iterable of raw text documents.

    Returns:
        Dense array of token counts, one row per document.
    """
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(data)
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back to the old name on older installations.
    get_names = getattr(vectorizer, 'get_feature_names_out', None) or vectorizer.get_feature_names
    # list() keeps the printed form identical regardless of which API answered.
    print(list(get_names()))
    return X.toarray()

def FilteredByYear(data,year):
    """Return the rows of ``data`` whose ``'Year'`` column equals ``year``.

    Args:
        data: DataFrame with a ``'Year'`` column.
        year: Value to match.

    Returns:
        DataFrame containing only the matching rows (original index kept).
    """
    year_mask = data['Year'] == year
    return data.loc[year_mask]
    
def TitleFilteredByYear(data,year):
    """Return the ``'Title'`` values of the rows whose ``'Year'`` equals ``year``.

    Args:
        data: DataFrame with ``'Title'`` and ``'Year'`` columns.
        year: Value to match.

    Returns:
        Series of titles for the matching rows (original index kept).
    """
    titles = data['Title']
    matches_year = data['Year'] == year
    return titles.loc[matches_year]

def CountWordsByYear(data,year,count=30):
    """Return the most frequent title words of the papers from one year.

    Args:
        data: DataFrame with ``'Year'`` and ``'Title'`` columns.
        year: Publication year to select.
        count: Number of top words to return (defaults to the previously
            hard-coded 30).

    Returns:
        List of ``(word, frequency)`` tuples as produced by ``CountWords``.
    """
    # FilteredByYear returns a DataFrame; only its 'Title' column is counted.
    rows_for_year = FilteredByYear(data, year)
    return CountWords(rows_for_year['Title'], count)

def CountWordsInAbstractByYear(data,year,count=30):
    """Return the most frequent abstract words of the papers from one year.

    Args:
        data: DataFrame with ``'Year'`` and ``'Abstract'`` columns.
        year: Publication year to select.
        count: Number of top words to return (defaults to the previously
            hard-coded 30).

    Returns:
        List of ``(word, frequency)`` tuples as produced by ``CountWords``.
    """
    rows_for_year = FilteredByYear(data, year)
    return CountWords(rows_for_year['Abstract'], count)
    
def MergeDataFrames(data,anzahlJahre):
    """Build one table of top title-word counts with a column per year.

    Covers the ``anzahlJahre`` years before the current calendar year (the
    current year itself is excluded). The per-year tables are inner-joined on
    ``'word'``, so only words that appear in every year's top list remain.

    Args:
        data: DataFrame with ``'Year'`` and ``'Title'`` columns.
        anzahlJahre: Number of years to look back.

    Returns:
        DataFrame indexed by ``'word'`` with one column per year, or an empty
        DataFrame when ``anzahlJahre`` yields no years.
    """
    current_year = datetime.datetime.now().year
    # None marks "nothing merged yet". Testing the accumulator with .empty
    # instead would also fire when an inner join legitimately produced no
    # common words, and the next year would then overwrite the result.
    merged = None
    for y in range(current_year - anzahlJahre, current_year):
        yearly = ListToDataArray(CountWordsByYear(data, y), y)
        if merged is None:
            merged = yearly
        else:
            merged = merged.merge(yearly, on='word', how='inner')
    if merged is None:
        return pd.DataFrame()
    return merged
    
def CosineSimilarity(data):
    """Compute the cosine similarity of ``data`` against itself.

    Args:
        data: Matrix-like input accepted by ``cosine_similarity``.

    Returns:
        The similarity matrix returned by ``cosine_similarity`` for ``data``
        paired with ``data``.
    """
    pairwise = cosine_similarity(X=data, Y=data)
    return pairwise

def CreateDocTermMatrix(data):
    """Build a document-term count matrix (English stop words removed).

    Args:
        data: Iterable of text documents, e.g. a pandas Series of abstracts.
            Missing entries (None/NaN) are treated as empty documents.

    Returns:
        pandas.DataFrame with one row per document and one column per term.
    """
    # CountVectorizer rejects NaN documents, so a missing text becomes an empty
    # string and contributes an all-zero row, keeping rows aligned with the input.
    documents = ['' if pd.isna(text) else text for text in data]

    count_vectorizer = CountVectorizer(stop_words='english')
    sparse_matrix = count_vectorizer.fit_transform(documents)
    print(sparse_matrix)

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back to the old name on older installations.
    get_names = getattr(count_vectorizer, 'get_feature_names_out', None) or count_vectorizer.get_feature_names
    return pd.DataFrame(sparse_matrix.toarray(), columns=list(get_names()))


In [4]:
# Create variables: shortcuts to frequently used columns of the loaded data
title = data['Title']
publisher = data['Publisher']
year = data['Year']

# Terms of special interest
wordlist = ['DevOps', 'engineering', 'Software', 'Engineering', 'software']

# Words ignored by CountWords. CountWords lower-cases every token before the
# lookup, so the upper-case 'A' entry cannot match there.
stopwords = [
    'this', 'is', 'none', 'in', 'of', 'A', 'a', 'and', 'for', 'with',
    'on', 'the', 'to', 'an', 'be', 'as', 'has', 'are', 'we', 'been',
    'by', 'many', 'that', 'at', '...', 'more', 'them', 'have', 'such',
]


In [5]:
# Earlier per-year title-word experiments, kept for reference
#l1 = CountWordsByYear(data,2017)
#l2 = CountWordsByYear(data,2018)
#l3 = CountWordsByYear(data,2019)
#l4 = CountWordsByYear(data,2020)
#l4 = ListToDataArray(l4,2020)
# Most frequent abstract words of the papers whose Year is 2019
l = CountWordsInAbstractByYear(data,2019)
# NOTE(review): CalculateSimilarity is not defined in the visible part of this notebook
#d = CalculateSimilarity(l)

In [6]:
# Most frequent abstract words of the papers whose Year is 2019,
# converted to a DataFrame indexed by word (column 'data' holds the counts)
l = CountWordsInAbstractByYear(data,2019)
df = ListToDataFrame(l)
#df
#df.plot.bar()

In [7]:
# Work in progress: document-term matrix of the abstracts and their pairwise cosine similarity
# NOTE(review): the recorded run of this cell ended with "TypeError: strip arg must be None or str"

abstract = CreateDocTermMatrix(data['Abstract'])
print(abstract)
print(CosineSimilarity(abstract))

TypeError: strip arg must be None or str

In [None]:
# Document-term matrix of the paper titles; the last line displays its term columns.
# NOTE(review): this rebinds `title`, which the variables cell bound to data['Title'].
title = CreateDocTermMatrix(data['Title'])
title.columns

In [None]:
#data.info()

In [None]:
# Load English tokenizer, tagger, parser and NER
#nlp = spacy.load("en_core_web_sm")

# Process whole documents
#doc = nlp(text)

# Analyze syntax
#print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])
#print("Verbs:", [token.lemma_ for token in doc if token.pos_ == "VERB"])

# Find named entities, phrases and concepts
#for entity in doc.ents:
##print(entity.text, entity.label_)

In [None]:

# Document similarity x graph analysis

# Visualise growth


In [None]:
#data2018 = TitleFilteredByYear(data,2018)
#data['Cites'].plot.bar()

In [None]:
#TfidfVector(data)


In [None]:
#data2018