In [1]:
import pandas as pd

In [155]:
# Load the editors' papers plus the two field-of-study tables.
papers_editors_fields = pd.read_csv("Z:/capstone_data/editors_and_their_papers.csv", index_col = 'Unnamed: 0')
# Raw strings keep the Windows backslashes literal. The previous plain strings relied on
# unrecognised escapes (\c, \F) passing through unchanged, which Python flags as invalid
# escape sequences, and needed "\\a" to avoid the "\a" bell character.
fields_children = pd.read_csv(r"Z:\capstone_data\advanced\FieldOfStudyChildren.txt", sep="\t", header = None, names=['parent', 'child'])
fields = pd.read_csv(r"Z:\capstone_data\advanced\FieldsOfStudy.txt", sep="\t", header = None, names=['parent', 'parent_level'], usecols=[0,5])

  mask |= (ar1 == a)


In [156]:
# keep only the rows whose Score is strictly below 0.5
papers_editors_fields = papers_editors_fields.loc[papers_editors_fields['Score'].lt(0.5)]

In [3]:
# pair every parent/child relationship with the level (0 to 5) recorded for the parent
def get_fields_data(fields, fields_children):  # FieldsOfStudy and FieldOfStudyChildren tables
    """Left-join the parent/child table onto the field table by ``parent``.

    Every row of ``fields_children`` is kept; the columns of ``fields`` are
    attached where the ``parent`` id matches and left missing otherwise.
    """
    return fields_children.merge(fields, how='left', on='parent')

In [4]:
# create dataframe with columns child and parent_info (dict of id: level)
def map_child_to_parent_fields(fields):  # takes dataframe and returns graph
    """Collect, for every child field, a ``{parent_id: parent_level}`` dict.

    Args:
        fields: DataFrame with ``parent``, ``child`` and ``parent_level`` columns.

    Returns:
        DataFrame with one row per distinct ``child`` and a ``parent_info``
        column holding the dict of that child's direct parents.

    The input frame is left untouched: the temporary ``parent_info`` pairs are
    built on a copy instead of being written into the caller's DataFrame.
    """
    paired = fields.assign(parent_info=list(zip(fields['parent'], fields['parent_level'])))

    # dict(series) would map index -> value, so the pairs are unpacked explicitly.
    df = (
        paired[['child', 'parent_info']]
        .groupby('child')['parent_info']
        .agg(lambda pairs: dict((parent, level) for parent, level in pairs))
        .reset_index()
    )

    return df

In [5]:
# collects every zero level field reachable from the field v by following parents
def depth_first_search(graph, v, visited):
    """Walk up the parent graph from ``v`` and gather its level-0 ancestors.

    ``graph`` maps a field to ``{parent: parent_level}``. Parents whose level is 0
    are recorded; all other parents are explored in turn. ``visited`` is updated
    in place with every field that was expanded, and fields already in it are
    skipped. The result is a list in no particular order.
    """
    found = set()
    pending = [v]

    while pending:
        node = pending.pop()
        if node not in visited:
            visited.add(node)
            if node in graph:
                for ancestor, level in graph[node].items():
                    if level == 0:
                        found.add(ancestor)
                    else:
                        pending.append(ancestor)

    return list(found)

In [6]:
#returns graph(dict of lists) structure with format ```field : [zero level fields]```
def map_field_to_zero_level_field(child_to_parent_fields_mapping, fields):
    """Map every child field to the list of its zero-level ancestors.

    Args:
        child_to_parent_fields_mapping: DataFrame whose first column is the child
            id and whose second column is that child's ``{parent: level}`` dict.
        fields: DataFrame with a ``child`` column listing the fields to resolve.

    Returns:
        dict of ``field -> [zero level fields]``, keyed in order of first appearance
        in ``fields['child']``.
    """
    child_parent_fields = child_to_parent_fields_mapping.to_dict('split')['data']
    graph = {row[0]: row[1] for row in child_parent_fields}

    zero_level_fields_mapping = {}

    # A child appears once per parent in ``fields``; resolving each distinct child once
    # gives the same mapping without repeating the search. Reading the column directly
    # also avoids iterrows(), which builds a Series per row and can change the id's dtype.
    for field in fields['child'].drop_duplicates().tolist():
        zero_level_fields_mapping[field] = depth_first_search(graph, field, set())

    return zero_level_fields_mapping

In [7]:
def collapse_to_dict(x):
    """Sum the second element of each ``(key, amount)`` pair per key.

    Returns a dict ``{key: total}`` with keys in order of first appearance.
    """
    totals = {}
    for key, amount in x:
        if key not in totals:
            totals[key] = 0
        totals[key] += amount
    return totals

In [66]:
#returns dataframe with paper data and column of {zerolevelfield: summed score, etc.}
def map_papers_to_fields(papers, zero_level_fields):
    """Sum each paper's field scores per zero-level field.

    Args:
        papers: DataFrame with at least ``PaperId``, ``FieldId`` and ``Score`` columns.
        zero_level_fields: DataFrame with ``FieldId`` and ``ParentIds`` columns.
            ``ParentIds`` may be a sequence of ids or its text form such as
            ``"[1, 2]"`` (what a CSV round trip produces).

    Returns:
        DataFrame with one row per ``PaperId`` and a ``ZeroLevelFieldsScores``
        column holding ``{zero_level_field_id: summed score}``. Ids parsed from
        text stay strings.

    Rows of the left merge that contain any missing value (for example a field
    with no entry in ``zero_level_fields``) are dropped before aggregation.
    """
    def split_parent_ids(parent_ids):
        # Text form "[a, b]": strip the brackets and split. An empty list "[]" yields no
        # ids instead of a single bogus '' id.
        if isinstance(parent_ids, str):
            return [token for token in parent_ids[1:-1].split(', ') if token]
        return list(parent_ids)

    paper_fields_mapping = pd.merge(papers, zero_level_fields, on='FieldId', how='left')
    # axis must be passed by keyword: positional dropna(0) is rejected by pandas 2.x.
    paper_fields_mapping = paper_fields_mapping.dropna(axis=0)

    # One list of (zero_level_field, score) pairs per merged row.
    paper_fields_mapping = paper_fields_mapping.assign(
        ZeroLevelFieldsScores=[
            [(field_id, score) for field_id in split_parent_ids(parent_ids)]
            for parent_ids, score in zip(paper_fields_mapping['ParentIds'], paper_fields_mapping['Score'])
        ]
    )

    df = (
        paper_fields_mapping
        .groupby('PaperId')['ZeroLevelFieldsScores']
        .agg(lambda rows: rows.values.tolist())
        .reset_index()
    )
    # Flatten the per-row lists of each paper and sum the scores per field.
    df['ZeroLevelFieldsScores'] = df['ZeroLevelFieldsScores'].apply(
        lambda rows: collapse_to_dict(pair for row in rows for pair in row)
    )

    return df

In [98]:
def get_author_field(authors_papers, this_author):
    """Pick the level 0 field with the highest summed score over an author's papers.

    ``authors_papers`` needs an ``AuthorId`` column and a ``ZeroLevelFieldsScores``
    column of ``{field: score}`` dicts. Returns the winning field, or -1 when the
    author has no scored field at all.
    """
    # step 1: the rows (papers) that belong to this author
    own_papers = authors_papers[authors_papers['AuthorId'] == this_author]

    # step 2: add up the score of every level 0 field across those papers
    totals = {}
    for record in own_papers.to_dict('records'):
        for field_id, weight in record['ZeroLevelFieldsScores'].items():
            totals[field_id] = totals.get(field_id, 0) + weight

    if not totals:
        return -1

    # (total, field) tuples: highest total wins, ties resolved by the larger field id
    best = max((total, field_id) for field_id, total in totals.items())
    return best[1]

In [134]:
def get_journal_field(journal_papers, this_journal):
    """Pick the level 0 field with the highest summed score over a journal's papers.

    ``journal_papers`` needs a ``JournalId`` column and a ``ZeroLevelFieldsScores``
    column of ``{field: score}`` dicts. Returns the winning field, or -1 when the
    journal has no scored field at all.
    """
    # step 1: the rows (papers) published in this journal
    in_journal = journal_papers['JournalId'] == this_journal
    this_journal_df = journal_papers[in_journal]

    # step 2: accumulate the score of every level 0 field across those papers
    totals = {}
    for record in this_journal_df.to_dict('records'):
        for field_id, weight in record['ZeroLevelFieldsScores'].items():
            totals[field_id] = totals.get(field_id, 0) + weight

    if totals:
        # compare (total, field) so the highest total wins; ties go to the larger field id
        _, best_field = max((total, field_id) for field_id, total in totals.items())
        return best_field
    return -1

In [45]:
# Rebuild the field -> zero-level-fields mapping from the raw field tables.
# NOTE(review): the earlier note here said to uncomment this when running from scratch,
# but the code is live; the cell below reassigns zero_level_df from 'zero_level_mapping.csv'.
new_fields = get_fields_data(fields, fields_children)
mapping = map_child_to_parent_fields(new_fields)
zero_level_fields_mapping = map_field_to_zero_level_field(mapping, new_fields)
# Split the mapping into parallel tuples of field ids and their zero-level field lists.
flds, prnts = zip(*zero_level_fields_mapping.items())
zero_level_df = pd.DataFrame({'FieldId': flds, 'ParentIds': prnts})

In [11]:
# zero_level_df.to_csv('./zero_level_mapping.csv')
# Load the cached mapping; this replaces any zero_level_df already held in memory.
# NOTE(review): map_papers_to_fields slices ParentIds as text, which presumably relies on
# this CSV round trip turning the id lists into strings — confirm before changing the cache format.
zero_level_df = pd.read_csv('zero_level_mapping.csv')

In [157]:
# Per paper: {zero-level field: summed score}, computed from the filtered editors' papers.
papers_fields = map_papers_to_fields(papers_editors_fields, zero_level_df)

In [158]:
# One row per (paper, author) with that paper's zero-level field scores attached.
authors_papers = (
    pd.merge(papers_editors_fields, papers_fields, on='PaperId', how='inner')
    .drop(['FieldId', 'Score'], axis=1)
    .drop_duplicates(subset=['PaperId', 'AuthorId'])
)

# Collect the predictions in a list and build the frame once; appending with
# .loc[len(...)] re-allocates the DataFrame on every insert.
author_rows = []
for author in papers_editors_fields['AuthorId'].unique():
    field_prediction = get_author_field(authors_papers, author)
    author_rows.append([int(author), field_prediction])
authors_fields = pd.DataFrame(author_rows, columns=['AuthorId', 'FieldPrediction'])

In [159]:
# Persist the author predictions; the index is written too (no index=False), as the unnamed first column.
authors_fields.to_csv('authors_fields_predictions.csv')

Unnamed: 0,AuthorId,FieldPrediction
0,2278406758,86803240
1,2014284122,15744967
2,2222372278,86803240
3,2640940786,86803240
4,2107252006,71924100
...,...,...
52767,2686191159,33923547
52768,3008005074,86803240
52769,2488279845,86803240
52770,3019503342,86803240


In [143]:
# Read only the first 100000 rows of the paper -> journal table, then drop rows with any missing value.
# NOTE(review): the row cap looks like a sampling limit for quick runs — confirm before using the results.
papers_journals = pd.read_csv("Z:/papers_journals.csv", index_col = 'Unnamed: 0', nrows=100000).dropna()

In [160]:
# One row per paper with its journal and zero-level field scores.
journals_papers = (
    pd.merge(papers_journals, papers_fields, on='PaperId', how='inner')
    .drop_duplicates(subset=['PaperId'])
)

# Collect the predictions in a list and build the frame once; appending with
# .loc[len(...)] re-allocates the DataFrame on every insert.
journal_rows = []
for journal in papers_journals['JournalId'].unique():
    field_prediction = get_journal_field(journals_papers, journal)
    journal_rows.append([str(journal), field_prediction])
journals_fields = pd.DataFrame(journal_rows, columns=['JournalId', 'FieldPrediction'])

In [161]:
# Persist the journal predictions; the index is written too (no index=False), as the unnamed first column.
journals_fields.to_csv('journals_fields_predictions.csv')