In [43]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
from tqdm import tqdm

In [44]:
characters_df = pd.read_csv('marvel-characters/characters.csv').merge(
    pd.read_csv('marvel-characters/charcters_stats.csv'), 
    left_on='name', 
    right_on='Name', 
    how='inner')

In [45]:
interactions_df = pd.read_csv('marvel-characters/charactersToComics.csv')

In [46]:
df = characters_df.merge(interactions_df, how='inner', on='characterID')

In [47]:
df.drop(['name'], axis=1, inplace=True)

In [48]:
df

Unnamed: 0,characterID,Name,Alignment,Intelligence,Strength,Speed,Durability,Power,Combat,Total,comicID
0,1009220,Captain America,good,63,19,35,56,46,100,319,16232
1,1009220,Captain America,good,63,19,35,56,46,100,319,16248
2,1009220,Captain America,good,63,19,35,56,46,100,319,16241
3,1009220,Captain America,good,63,19,35,56,46,100,319,16234
4,1009220,Captain America,good,63,19,35,56,46,100,319,16236
...,...,...,...,...,...,...,...,...,...,...,...
43382,1011073,Bloodaxe,bad,63,80,33,80,84,84,424,11783
43383,1011073,Bloodaxe,bad,63,80,33,80,84,84,424,11780
43384,1011073,Bloodaxe,bad,63,80,33,80,84,84,424,11779
43385,1011073,Bloodaxe,bad,63,80,33,80,84,84,424,11778


In [49]:
df['characterID'].value_counts()

1009610    3199
1009368    2388
1009718    2163
1009220    2040
1009664    1589
           ... 
1011293       2
1011088       2
1010361       1
1011349       1
1010764       1
Name: characterID, Length: 197, dtype: int64

In [50]:
df['comicID'].value_counts()

4241     59
4461     54
15902    47
5844     43
16209    39
         ..
19559     1
66670     1
3183      1
56437     1
8188      1
Name: comicID, Length: 16643, dtype: int64

In [51]:
characters_interactions = df.groupby('comicID').apply(lambda x: list(x['Name'])).values

In [52]:
print('size of the largest community: {}'.format(len(max(characters_interactions, key=lambda x: len(x)))))

size of the largest community: 59


In [53]:
edges = []
for item, row in tqdm(df.iterrows()): 
    edges.append((row['Name'], row['comicID']))

43387it [00:03, 11462.92it/s]


In [54]:
g = nx.Graph()
g.add_edges_from(edges)

In [55]:
characters_set = set(df['Name'].values)

In [56]:
projected_graph = nx.bipartite.projected_graph(g, characters_set)

In [57]:
nx.write_gexf(projected_graph, 'marvel-characters.gexf')

---

This part was after generating the modularity classes with Gephi

In [58]:
import pandas as pd

In [59]:
communities_df = pd.read_csv('marvel-communities.csv')

In [60]:
communities_df

Unnamed: 0,Id,Label,timeset,modularity_class
0,Thanos,Thanos,,0
1,Firestar,Firestar,,0
2,Apocalypse,Apocalypse,,3
3,Punisher,Punisher,,0
4,Luke Cage,Luke Cage,,0
...,...,...,...,...
191,Deathlok,Deathlok,,0
192,Hawkeye,Hawkeye,,0
193,Leader,Leader,,0
194,Annihilus,Annihilus,,0


In [61]:
df = df.merge(communities_df, left_on='Name', right_on='Id', how='inner')

In [62]:
df.head(3)

Unnamed: 0,characterID,Name,Alignment,Intelligence,Strength,Speed,Durability,Power,Combat,Total,comicID,Id,Label,timeset,modularity_class
0,1009220,Captain America,good,63,19,35,56,46,100,319,16232,Captain America,Captain America,,0
1,1009220,Captain America,good,63,19,35,56,46,100,319,16248,Captain America,Captain America,,0
2,1009220,Captain America,good,63,19,35,56,46,100,319,16241,Captain America,Captain America,,0


In [63]:
df.drop(['Id', 'Label', 'timeset', 'comicID'], axis=1, inplace=True)

In [67]:
df.drop_duplicates(inplace=True)
df.reset_index(inplace=True, drop=True)

In [68]:
df.head(3)

Unnamed: 0,characterID,Name,Alignment,Intelligence,Strength,Speed,Durability,Power,Combat,Total,modularity_class
0,1009220,Captain America,good,63,19,35,56,46,100,319,0
1,1010740,Winter Soldier,good,56,32,35,65,60,84,332,0
2,1009471,Nick Fury,good,75,11,23,42,25,100,276,0


In [69]:
df.shape

(198, 11)

In [70]:
df.to_csv('marvel_characters_communities.csv', index=False)