In [1]:
import pandas as pd
import numpy as np
from collections import Counter
import spacy
import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
data = pd.read_csv('./data/PoPCites2.csv')

In [3]:
#Funktionen
def Count(data,count):
    cnt = Counter()
    for item in data: 
        cnt[item] += 1
    return cnt.most_common(count)

def CountCites(data,count):
    
    i = 0
    cnt = Counter()
    for item in data['Cites']: 
        cnt[item] += 1
    return cnt.most_common(count)

def CountWords(data,count):
    cnt = Counter()
    #Pandas.Series durchlaufen um die Title zu bekommen
    for index, value in data.items():
        words = value.lower().split(' ')
        #Title in Wörter teilen
        for word in words:
            #Aussortieren der Stopwords
            if not word in stopwords:
                #Für bestimmte Begriffe
                #for word in wordlist:
                cnt[word] += 1
    return cnt.most_common(count)

def CountToTfidf(counts):
    transformer = TfidfTransformer(smooth_idf=False)
    
    #Gewichtet die CountMatrix
    tfidf = transformer.fit_transform(counts)

    #Gibt Array zurück
    return tfidf.toarray()

def ListToDataFrame(l):
    ld = pd.DataFrame.from_dict(l)
    ld = ld.rename(columns={ 0:'word',1:'data'})
    ld = ld.set_index('word')
    return ld


def ListToDataArray(l,year):
    ld = pd.DataFrame.from_dict(l)
    ld = ld.rename(columns={ 0:'word',1:year})
    ld = ld.set_index('word')
    #Gibt Pandas.DataFrame zurück
    return ld


def TfidfVector(data):
    vectorizer = TfidfVectorizer()
    X = vectorizer.fit_transform(data)
    print(vectorizer.get_feature_names())
    print(X.toarray())
    
def CountVector(data):
    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(data)
    print(vectorizer.get_feature_names())
    return X.toarray()

def FilteredbyPublisher(data,publisher):
    databypublisher = data.loc[data['Publisher'] == publisher]
    return databypublisher

def FilteredByYear(data,year):
    databyyear = data.loc[data['Year'] == year]
    return databyyear
    
def TitleFilteredByYear(data,year):
    databyyear = data['Title'].loc[data['Year'] == year]
    return databyyear

def CountWordsByYear(data,year):
    #d = Pandas.Series
    d = FilteredByYear(data,year)
    l = CountWords(d['Title'],30)
    #Gibt List zurück
    return l

def CountWordsInAbstractByYear(data,year):
    d = FilteredByYear(data,year)
    l = CountWords(d['Abstract'],30)
    return l
    
def MergeDataFrames(data,anzahlJahre):
    year = datetime.datetime.now()
    year = year.year
    da = pd.DataFrame()
    for y in range(year - anzahlJahre, year):
        l = CountWordsByYear(data,y)
        p = ListToDataArray(l,y)
        if(da.empty):
            da = p
        else:
            da = da.merge(p, on= 'word' ,how='inner')
    return da       
    
def CosineSimilarity(data):
    return cosine_similarity(data,data)

def CreateTermDocMatrix(data):
    documents = []

    for text in data['Abstract']:
        documents.append(text)


    count_vectorizer = CountVectorizer(stop_words='english')
    #count_vectorizer = CountVectorizer()
    sparse_matrix = count_vectorizer.fit_transform(documents)

    doc_term_matrix = sparse_matrix.todense()
    df = pd.DataFrame(doc_term_matrix, columns=count_vectorizer.get_feature_names())
    df['PublishingYear'] = data['Year']
    returndf = df.groupby('PublishingYear').sum()
    return returndf


In [4]:
#Variablen erstellen
title = data['Title']
publisher = data['Publisher']
year = data['Year']

wordlist = [
    'DevOps',
    'engineering',
    'Software',
    'Engineering',
    'software'
    
]
stopwords =[
    'this',
    'is',
    'none',
    'in',
    'of',
    'A',
    'a',
    'and',
    'for',
    'with',
    'on',
    'the',
    'to',
    'an',
    'be',
    'as',
    'has',
    'are',
    'we',
    'been',
    'by',
    'many',
    'that',
    'at',
    '...',
    'more',
    'them',
    'have',
    'such'
]


In [5]:
abstract = CreateDocTermMatrix(data['Abstract'])
l = CosineSimilarity(abstract)
l



array([[1.        , 0.21710791, 0.03618465, ..., 0.12021658, 0.15456232,
        0.12255111],
       [0.21710791, 1.        , 0.12307692, ..., 0.10222489, 0.11500161,
        0.08684168],
       [0.03618465, 0.12307692, 1.        , ..., 0.01703748, 0.0328576 ,
        0.01736834],
       ...,
       [0.12021658, 0.10222489, 0.01703748, ..., 1.        , 0.12735696,
        0.13464028],
       [0.15456232, 0.11500161, 0.0328576 , ..., 0.12735696, 1.        ,
        0.07418865],
       [0.12255111, 0.08684168, 0.01736834, ..., 0.13464028, 0.07418865,
        1.        ]])

In [6]:
databyyear = data.groupby(['Year','Title'])

In [7]:
#Hier arbeiten


In [8]:
title = CreateDocTermMatrix(data['Title'])

In [9]:
#data.info()

import en_core_web_md
nlp = en_core_web_md.load()
#nlp = spacy.load("en_core_web_md")

doc = nlp("Tim and Tom")
token1, token2 = doc[0], doc[2]

# Berechne die Ähnlichkeit der Tokens "TV" und "books"
similarity = token1.similarity(token2)
print(similarity)

0.6667456


In [10]:
# Load English tokenizer, tagger, parser and NER
#nlp = spacy.load("en_core_web_sm")

# Process whole documents
#doc = nlp(text)

# Analyze syntax
#print("Noun phrases:", [chunk.text for chunk in doc.noun_chunks])
#print("Verbs:", [token.lemma_ for token in doc if token.pos_ == "VERB"])

# Find named entities, phrases and concepts
#for entity in doc.ents:
##print(entity.text, entity.label_)

In [11]:


token_text = [token.text for token in doc]
>>> print(token_text)
>>> token = [token for token in doc]
>>> print(token)
d

In [12]:
#data2018 = TitleFilteredByYear(data,2018)
#data['Cites'].plot.bar()

In [13]:
#TfidfVector(data)


In [14]:
#data2018