In [1]:
import networkx as nx
import numpy as np
import pandas as pd
import itertools

import matplotlib.pyplot as plt

In [2]:
cat_path  = "cat.csv"
data_path = "data.csv"

all_cat = open(cat_path, 'r').read().split(",")
all_cat.sort()
# Category dictionary
cat_dict = {cat:index for (index, cat) in enumerate(all_cat)}

df = pd.read_csv(data_path)

# delete the NAs
df.dropna(inplace = True)

# delete the duplicated paper
df.drop_duplicates(subset = "paper ID", inplace = True)

In [3]:
df.shape

(124149, 4)

In [4]:
# df.replace('ao-sci', "physics", inplace=True)

In [5]:
# Collecting all the aurhors
au_lst = []
for paper_authors in df["Authors"].values:
    for author in paper_authors.split(";"):
        au_lst.append(author)
        
# Get all the unique authors       
au_lst = list(set(au_lst))
au_lst.sort()

au_dict = {author:index for (index, author) in enumerate(au_lst)}

In [6]:
# Creating the matrix
n = len(au_dict)
p = len(all_cat)
credit_matrix = np.zeros((n, p))

for index, row in df[["Authors", "Category"]].iterrows():
    author_list = row["Authors"].split(";")
    contribute = 1.0/len(au_lst)
    for author in author_list:
        try:
            credit_matrix[ au_dict[author], cat_dict[row["Category"]] ] += contribute
        except KeyError as e:
            print(e)

# Calculating stuff
author_activity = credit_matrix / credit_matrix.sum(axis=1, keepdims=True)
author_weight_in_field = credit_matrix / credit_matrix.sum(axis=0, keepdims=True)
field_field_influence = np.transpose(author_activity).dot(author_weight_in_field)

proj1_df = pd.DataFrame(field_field_influence, columns = all_cat, index=all_cat)
proj1_df

Unnamed: 0,astro-ph,cond-mat,cs,econ,eess,gr-qc,hep-ex,hep-lat,hep-ph,hep-th,math,math-ph,nlin,nucl-ex,nucl-th,physics,q-bio,q-fin,quant-ph,stat
astro-ph,0.922227,0.000334,0.00031,0.0,4.8e-05,0.009814,0.001123,0.000779,0.007703,0.000611,0.000188,0.000467,0.000437,0.000445,0.005739,0.002182,0.000338,0.000129,0.001066,0.000538
cond-mat,0.00248,0.872934,0.002193,0.0,0.001347,0.00789,0.00383,0.017261,0.00664,0.015129,0.00157,0.021968,0.034228,0.003886,0.015409,0.022476,0.024155,0.023694,0.029574,0.001948
cs,0.007813,0.007441,0.952739,0.045347,0.103788,0.003673,0.005388,0.007876,0.007712,0.004277,0.023898,0.006519,0.020679,0.00398,0.002076,0.015015,0.023982,0.02379,0.013309,0.051101
econ,0.0,0.0,4.6e-05,0.808202,0.0,0.0,0.0,0.0,0.0,0.0,6.9e-05,0.0,0.0,0.0,0.0,0.0,0.0,0.00022,0.0,0.000715
eess,1.2e-05,4.6e-05,0.001055,0.0,0.840865,0.000267,0.000153,0.0,0.0,0.0,0.000242,0.0,0.000265,0.0,0.000393,0.00019,0.000244,0.000179,0.0,0.001109
gr-qc,0.00498,0.000539,7.4e-05,0.0,0.000529,0.874734,7.7e-05,0.000601,0.002412,0.022956,0.000295,0.007617,0.00103,0.000125,0.000236,0.000884,0.000308,0.000345,0.011129,1.5e-05
hep-ex,0.000994,0.000456,0.000189,0.0,0.000529,0.000134,0.881312,0.000878,0.003251,0.000124,9.4e-05,0.000793,4.3e-05,0.01261,0.001105,0.003013,3.6e-05,6.2e-05,0.000226,4.4e-05
hep-lat,0.000614,0.001834,0.000247,0.0,0.0,0.000935,0.000783,0.926173,0.010682,0.007853,0.000145,0.002422,0.000497,0.000585,0.001983,0.000568,0.00065,0.000325,0.001722,0.000256
hep-ph,0.00466,0.000541,0.000185,0.0,0.0,0.002876,0.002223,0.00819,0.88808,0.020853,0.000167,0.001658,0.000724,0.00055,0.007976,0.000882,0.000165,0.000349,0.001859,0.0
hep-th,0.000334,0.001114,9.3e-05,0.0,0.0,0.024746,7.7e-05,0.005444,0.018854,0.87413,0.000588,0.003849,0.001372,0.000308,0.0,0.000447,0.00034,0.000305,0.004498,0.000133


# Project 2

In [7]:
from setup import *

In [8]:
# # Declaring variables
# num_field = len(all_cat)
# num_author = len(au_dict)
# num_steps = 2


# current_credit = credit_matrix.copy()

# # Funding money
# d = np.random.rand(num_field)
# current_field_funding = d / d.sum()

# def update_author_funding(credit, field_funding):
#     author_weight_in_field = credit / credit.sum(axis=0,keepdims=True)
#     author_funding_from_field = author_weight_in_field * field_funding
#     author_funding = author_funding_from_field.sum(axis=1,keepdims=True)
#     return author_funding

# def compute_credit(author_funding):
#     new_credit = author_prod * author_funding
#     field_credit = new_credit.sum(axis=0)
#     author_credit = new_credit.sum(axis=1)
#     total_credit = new_credit.sum()
#     return new_credit, total_credit
    
# current_author_funding = update_author_funding(current_credit, current_field_funding)
# author_prod = current_credit / current_author_funding
# current_credit, current_total_credit = compute_credit(current_author_funding)

# for i in range(num_steps):
#     # Pick a random "direction" to move funding levels and computes effect on credit.
#     v = np.random.rand(num_field)
#     v -= v.mean()  # to make sure funding levels sum to 1
#     x = current_field_funding
#     # Makes sure we don't set any funding less than 0 or more than 1
#     k0 = ((np.zeros_like(x) - x) / v).max()
#     k1 = (( np.ones_like(x) - x) / v).max()
#     k = min(k0,k1)*0.05

#     new_field_funding = current_field_funding + k*v
#     new_author_funding = update_author_funding(current_credit, new_field_funding)
#     new_credit, new_total_credit = compute_credit(new_author_funding)
#     best_field_fund, best_credit, best_total_credit = current_field_funding, current_credit, current_total_credit
    
#     current_field_funding, current_credit, current_total_credit = new_field_funding, new_credit, new_total_credit
#     if(best_total_credit < current_total_credit):
#         best_field_funding, best_credit, best_total_credit = current_field_funding, current_credit, current_total_credit
#     print("Current Field Funding")
#     print(current_field_funding)
#     print("Current Credit")
#     display(margins(current_credit))
# print("Best Field Funding")
# print(best_field_funding)
# print("Best Credit")
# display(margins(best_credit))

# Project 3
Text Classifier

In [9]:
df_proj3 = df[["Title", "Category"]].copy()

# removing the new line character
for index, val in enumerate(df_proj3["Title"]):
    if "\n" in val:
        df_proj3["Title"].iloc[index] = val.replace("\n","")

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

model = make_pipeline(TfidfVectorizer(), MultinomialNB())
model2 = make_pipeline(TfidfVectorizer(), LogisticRegression())
model3 = make_pipeline(TfidfVectorizer(), RandomForestClassifier())

In [11]:
train, test = train_test_split(df_proj3, test_size=0.33, random_state=42)

In [12]:
model.fit(train["Title"].values, train["Category"].values)
pred = model.predict(test["Title"].values)

In [13]:
model2.fit(train["Title"].values, train["Category"].values)
pred2 = model2.predict(test["Title"].values)

In [14]:
model3.fit(train["Title"].values, train["Category"].values)
pred3 = model3.predict(test["Title"].values)

In [15]:
accuracy_score(y_test, pred)

NameError: name 'y_test' is not defined

In [None]:
accuracy_score(y_test, pred2)

In [None]:
accuracy_score(y_test, pred3)

In [None]:
from sklearn.metrics import confusion_matrix

mat = confusion_matrix(y_test, pred)
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False,
            xticklabels=all_cat, yticklabels=all_cat)
plt.xlabel('true label')
plt.ylabel('predicted label');

plt.show()