# CSCD25 Project Part 2

This file is intended to process the full dataset. We will choose a few subreddits and use data from 2019-2021 to construct datset for a more comprehensive graph in finding path.

https://tonyli1121.github.io/Reddit-Analysis/

**This file contains:**

```
- load and process full dataset
- analysis on research question 3 and 4
```

## Choosing subreddits

Choose subreddits with least freq from main_dataset (2% random sample of all data). We use the "least" frequency because:

    (1) smaller datset size
    (2) still 'one' subreddit, maintains the component
    (3) even it's the least, when using the full set it is still a large amount

In [1]:
import pandas as pd

In [2]:
main_comments = pd.read_csv('main_comments.csv.gz')

In [3]:
grouped_comments = main_comments.groupby('subreddit')

In [4]:
grouped_comments['subreddit'].value_counts().sort_values().head(5)

subreddit         subreddit       
PS5               PS5                  90611
pokemontrades     pokemontrades       101988
EscapefromTarkov  EscapefromTarkov    102499
Whatcouldgowrong  Whatcouldgowrong    103317
LivestreamFail    LivestreamFail      106799
Name: subreddit, dtype: int64

In [5]:
grouped_comments['subreddit'].value_counts().sort_values().tail(5)

subreddit       subreddit     
teenagers       teenagers          879026
wallstreetbets  wallstreetbets     916366
memes           memes             1368062
politics        politics          1392243
AskReddit       AskReddit         4041365
Name: subreddit, dtype: int64

## Loading comment

we just load subreddits ['ps5', 'pokemontrades', 'EscapefromTarkov'] in case kernal dies

we will also try to run with ['memes','politics','AskReddit']

In [6]:
import datetime
from datetime import date, timedelta
from tqdm import tqdm

start_date = datetime.date(2019, 1, 1)

end_date = datetime.date(2021, 6, 30)

delta = datetime.timedelta(days=1)

PS5 = pd.DataFrame()
pokemontrades = pd.DataFrame()
EscapefromTarkov = pd.DataFrame()
memes = pd.DataFrame()
politics = pd.DataFrame()
AskReddit = pd.DataFrame()

for i in tqdm(range((end_date - start_date).days)):  
    url_name = 'http://csslab.cs.toronto.edu/cscd25/full/comments_' + str(start_date + i*delta) +'.csv.gz'
    
    df = pd.read_csv(url_name)    
    df = df[df.author != '[deleted]'] #[deleted]
    df = df[df.author != 'AutoModerator'] #[auto moderator]
    
    PS5 = PS5.append(df.loc[df.subreddit == 'PS5'])
    pokemontrades = pokemontrades.append(df.loc[df.subreddit == 'pokemontrades'])
    EscapefromTarkov = EscapefromTarkov.append(df.loc[df.subreddit == 'EscapefromTarkov'])
    #memes = memes.append(df.loc[df.subreddit == 'memes'])
    #politics = politics.append(df.loc[df.subreddit == 'politics'])
    #AskReddit = AskReddit.append(df.loc[df.subreddit == 'AskReddit'])
    

100%|███████████████████████████████████████| 911/911 [4:14:14<00:00, 16.75s/it]


In [7]:
'''memes.to_csv('full/h1.csv')
politics.to_csv('full/h2.csv')
AskReddit.to_csv('full/h3.csv')'''

PS5.to_csv('full/l1.csv')
pokemontrades.to_csv('full/l2.csv')
EscapefromTarkov.to_csv('full/l3.csv')

## Loading submissions

In [8]:
import datetime
from datetime import date, timedelta
from tqdm import tqdm

url_name = 'http://csslab.cs.toronto.edu/cscd25/full/submissions_2019-01-01.csv.gz'
    
df = pd.read_csv(url_name)
df = df.loc[df.subreddit.isin(['PS5','pokemontrades','EscapefromTarkov'])]
df = df[['id','author']]
df = df[df.author != '[deleted]'] #[deleted]
df = df[df.author != 'AutoModerator'] #[auto moderator]
df.columns = ['link_id', 'link_author']
  
PS5 = PS5.merge(df, how='left', on='link_id')
pokemontrades = pokemontrades.merge(df, how='left', on='link_id')
EscapefromTarkov = EscapefromTarkov.merge(df, how='left', on='link_id')

start_date = datetime.date(2019, 1, 2)

end_date = datetime.date(2021, 6, 30)

delta = datetime.timedelta(days=1)

In [9]:
for i in tqdm(range((end_date - start_date).days)):  
    url_name = 'http://csslab.cs.toronto.edu/cscd25/full/submissions_' + str(start_date + i*delta) +'.csv.gz'
    
    df = pd.read_csv(url_name)
    df = df.loc[df.subreddit.isin(['PS5','pokemontrades','EscapefromTarkov'])]
    df = df[['id','author']]
    df = df[df.author != '[deleted]'] #[deleted]
    df = df[df.author != 'AutoModerator'] #[auto moderator]
    df.columns = ['link_id', 'link_author']
    
    PS5 = PS5.merge(df, how='left', on='link_id')
    PS5['link_author'] = PS5['link_author_y'].fillna(PS5['link_author_x'])
    PS5 = PS5.drop(['link_author_x','link_author_y'], axis=1)
    
    pokemontrades = pokemontrades.merge(df, how='left', on='link_id')
    pokemontrades['link_author'] = pokemontrades['link_author_y'].fillna(pokemontrades['link_author_x'])
    pokemontrades = pokemontrades.drop(['link_author_x','link_author_y'], axis=1)
    
    EscapefromTarkov = EscapefromTarkov.merge(df, how='left', on='link_id')
    EscapefromTarkov['link_author'] = EscapefromTarkov['link_author_y'].fillna(EscapefromTarkov['link_author_x'])
    EscapefromTarkov = EscapefromTarkov.drop(['link_author_x','link_author_y'], axis=1)
    '''
    memes = memes.merge(df, how='left', on='link_id')
    memes['link_author'] = memes['link_author_y'].fillna(memes['link_author_x'])
    memes = memes.drop(['link_author_x','link_author_y'], axis=1)
    
    politics = politics.merge(df, how='left', on='link_id')
    politics['link_author'] = politics['link_author_y'].fillna(politics['link_author_x'])
    politics = politics.drop(['link_author_x','link_author_y'], axis=1)
    
    AskReddit = AskReddit.merge(df, how='left', on='link_id')
    AskReddit['link_author'] = AskReddit['link_author_y'].fillna(AskReddit['link_author_x'])
    AskReddit = AskReddit.drop(['link_author_x','link_author_y'], axis=1)'''

PS5 = PS5.dropna()
pokemontrades = pokemontrades.dropna()
EscapefromTarkov = EscapefromTarkov.dropna()

100%|███████████████████████████████████████| 910/910 [4:16:32<00:00, 16.91s/it]


In [10]:
PS5.to_csv('full/l1.csv')
pokemontrades.to_csv('full/l2.csv')
EscapefromTarkov.to_csv('full/l3.csv')

## Looking at full dataset

In [1]:
import pandas as pd

PS5 = pd.read_csv('full/l1.csv',index_col=0)
#pokemontrades = pd.read_csv('full/l2.csv', index_col = 0)
#EscapefromTarkov = pd.read_csv('full/l3.csv', index_col = 0)

In [2]:
PS5.head()

Unnamed: 0,id,score,link_id,author,subreddit,created_utc,link_author
16,t1_ed160ry,12,t3_abkzn8,streakman0811,PS5,1546373679,thuper_thayan
17,t1_ed167ca,3,t3_abkzn8,HoustonRocket,PS5,1546373802,thuper_thayan
18,t1_ed168w8,1,t3_abkzn8,thuper_thayan,PS5,1546373831,thuper_thayan
19,t1_ed17v69,1,t3_abkzn8,tmcd35,PS5,1546374894,thuper_thayan
20,t1_ed1a85l,3,t3_abkzn8,Magicihan,PS5,1546376498,thuper_thayan


In [3]:
PS5.shape

(3580232, 7)

## Finding path

In [4]:
import networkx as nx
import tqdm

result = pd.DataFrame(index = ['PS5 - full','pokemontrades - full','EscapefromTarkov - full'], 
                                columns = ['Number of Authors',
                                           'Nonzero',
                                           '% Existence', 
                                           'average', 
                                           'median'])



In [28]:
samplesize = [10000, 50000]

for size in samplesize:
    tmp = PS5.sample(size,random_state = 1121)
    G = nx.from_pandas_edgelist(tmp, 'author','link_author')

    countPath = 0
    avgs = []
    max_len = 0

    subreddit = 'PS5 - ' + str(size)

    authors_list = tmp.author.append(tmp.link_author).unique().tolist()
    print('-- START ANALYZING')

    for i in tqdm.tqdm(authors_list):
        author_pathlen_dict = nx.single_source_dijkstra_path_length(G,i, cutoff=10)
        countPath += len(author_pathlen_dict)
        if len(author_pathlen_dict)== 1:
            tmp = 0
        else:
            tmp = sum(author_pathlen_dict.values()) / (len(author_pathlen_dict)-1)
        avgs.append(tmp)
        max_len = max(max_len, max(author_pathlen_dict.values()))

    result.at[subreddit, 'Number of Authors'] = len(authors_list)
    result.at[subreddit,'Nonzero'] = countPath
    result.at[subreddit,'% Existence'] = countPath/len(authors_list)**2
    result.at[subreddit,'average'] = sum(avgs)/len(authors_list)
    result.at[subreddit,'median'] = avgs[len(avgs)//2]

-- START ANALYZING


100%|████████████████████████████████████| 10839/10839 [01:18<00:00, 137.47it/s]


-- START ANALYZING


100%|█████████████████████████████████████| 34425/34425 [52:47<00:00, 10.87it/s]


In [6]:
result

Unnamed: 0,Number of Authors,Nonzero,% Existence,average,median
PS5 - full,,,,,
pokemontrades - full,,,,,
EscapefromTarkov - full,,,,,
PS5 - 10000,10839.0,35685561.0,0.303748,3.564574,6.663295
PS5 - 50000,34425.0,830961037.0,0.701185,4.055531,5.30433


In [31]:
#result.to_csv('result_full_trial.csv')

#### USE SNAP

adding nodes: https://stackoverflow.com/questions/51780621/converting-pandas-dataframe-to-snap-py

In [5]:
import snap
result = pd.read_csv('result_full_trial.csv', index_col=0)

In [7]:
samplesize = [10000, 50000]

for size in samplesize:
    tmp = PS5.sample(size,random_state = 1121)
    
    # encode into int to pass in snap graph
    author_list = tmp.author.append(tmp.link_author).unique().tolist()
    encode_int = pd.DataFrame({'author': author_list, 'author_id':range(len(author_list))})
    tmp = tmp.merge(encode_int, on = 'author')
    encode_int.columns = ['link_author','link_author_id']
    tmp = tmp.merge(encode_int, on = 'link_author')
    
    # construct graph in snap
    G1 = snap.TUNGraph.New()
    # Add nodes:
    nodes = tmp.author_id.append(tmp.link_author_id).unique().tolist()
    for node in tqdm.tqdm(nodes):
        G1.AddNode(int(node))
    # Add edges:
    for index, row in tmp.iterrows():
        G1.AddEdge(int(row['link_author_id']), int(row['author_id']))
    
    countPath = 0
    avgs = []
    max_len = 0

    subreddit = '(snap) PS5 - ' + str(size)

    print('-- START ANALYZING')

    for i in tqdm.tqdm(nodes):
        shortestPath, NIdToDistH = G1.GetShortPathAll(i, MaxDist = 10)
        countPath += len(NIdToDistH)
        if len(NIdToDistH)== 1:
            tmp = 0
        else:
            sum_length = 0
            for item in NIdToDistH:
                sum_length += NIdToDistH[item]
            tmp =  sum_length / (len(NIdToDistH)-1)
        avgs.append(tmp)

    result.at[subreddit, 'Number of Authors'] = len(nodes)
    result.at[subreddit,'Nonzero'] = countPath
    result.at[subreddit,'% Existence'] = countPath/len(nodes)**2
    result.at[subreddit,'average'] = sum(avgs)/len(nodes)
    result.at[subreddit,'median'] = avgs[len(avgs)//2]

100%|█████████████████████████████████| 10839/10839 [00:00<00:00, 864758.07it/s]


-- START ANALYZING


100%|████████████████████████████████████| 10839/10839 [01:14<00:00, 145.18it/s]
100%|█████████████████████████████████| 34425/34425 [00:00<00:00, 810240.54it/s]


-- START ANALYZING


100%|█████████████████████████████████████| 34425/34425 [29:19<00:00, 19.56it/s]


In [7]:
result

Unnamed: 0,Number of Authors,Nonzero,% Existence,average,median
PS5 - full,,,,,
pokemontrades - full,,,,,
EscapefromTarkov - full,,,,,
PS5 - 10000,10839.0,35685561.0,0.303748,3.564574,6.663295
PS5 - 50000,34425.0,830961037.0,0.701185,4.055531,5.30433


In [16]:
# encode into int to pass in snap graph
author_list = PS5.author.append(PS5.link_author).unique().tolist()
encode_int = pd.DataFrame({'author': author_list, 'author_id':range(len(author_list))})
PS5 = PS5.merge(encode_int, on = 'author')
encode_int.columns = ['link_author','link_author_id']
PS5 = PS5.merge(encode_int, on = 'link_author')
    
# construct graph in snap
G1 = snap.TUNGraph.New()
# Add nodes:
nodes = PS5.author_id.append(PS5.link_author_id).unique().tolist()
for node in nodes:
    G1.AddNode(int(node))
# Add edges:
for index, row in tqdm.tqdm(PS5.iterrows()):
    G1.AddEdge(int(row['link_author_id']), int(row['author_id']))

3580232it [02:23, 24939.54it/s]


In [34]:
countPath = 0
avgs = []
max_len = 0

subreddit = 'PS5 - full'
print('-- START ANALYZING')

for i in tqdm.tqdm(nodes):
    shortestPath, NIdToDistH = G1.GetShortPathAll(i)
    countPath += len(NIdToDistH)
    if len(NIdToDistH)== 1:
        tmp = 0
    else:
        sum_length = 0
        for item in NIdToDistH:
            sum_length += NIdToDistH[item]
        tmp =  sum_length / (len(NIdToDistH)-1)
    avgs.append(tmp)
    max_len = max(max_len, tmp)

result.at[subreddit, 'Number of Authors'] = len(nodes)
result.at[subreddit,'Nonzero'] = countPath
result.at[subreddit,'% Existence'] = countPath/len(nodes)**2
result.at[subreddit,'average'] = sum(avgs)/len(nodes)
result.at[subreddit,'median'] = avgs[len(avgs)//2]

-- START ANALYZING


100%|████████████████████████████████| 322476/322476 [57:27:42<00:00,  1.56it/s]


In [10]:
result

Unnamed: 0,Number of Authors,Nonzero,% Existence,average,median
PS5 - full,322476.0,103665300000.0,0.99687,3.451716,3.491083
pokemontrades - full,,,,,
EscapefromTarkov - full,,,,,
PS5 - 10000,10839.0,35685560.0,0.303748,3.564574,6.663295
PS5 - 50000,34425.0,830961000.0,0.701185,4.055531,5.30433
(snap) PS5 - 10000,10839.0,35685560.0,0.303748,3.564574,9.478597
(snap) PS5 - 50000,34425.0,830961000.0,0.701185,4.055531,5.393706


In [36]:
print(max_len) # max avg path length

6.550287293847253


In [9]:
result = pd.read_csv('result_full_snap.csv', index_col=0)
#a = pd.DataFrame(data = {'author':nodes, 'avg per author': avgs})
a = pd.read_csv('full/PS5_author_avg.csv', index_col = 0)

In [11]:
a

Unnamed: 0,author,avg per author
0,0,2.586837
1,1,3.076364
2,2,3.493714
3,3,3.100134
4,4,2.695310
...,...,...
322471,322047,1.000000
322472,322160,4.335972
322473,322158,1.000000
322474,322245,1.000000


In [17]:
G1.DelDegKNodes(1,1)
G1.DelDegKNodes(0,0)

In [18]:
sorted_nodes = {}
for NI in G1.Nodes():
    sorted_nodes[NI.GetId()] = NI.GetOutDeg()

sorted_nodes = {k: v for k, v in sorted(sorted_nodes.items(), key=lambda item: -item[1])}.keys()
print(len(sorted_nodes))
# sorted nodes are in descending order of degree

148422


In [22]:

countPath = 0
avgs = []
max_len = 0

subreddit = 'PS5 - filtered'
print('-- START ANALYZING')

nodes = sorted_nodes

for i in tqdm.tqdm(nodes):
    shortestPath, NIdToDistH = G1.GetShortPathAll(i)
    countPath += len(NIdToDistH)
    if len(NIdToDistH)== 1:
        tmp = 0
    else:
        sum_length = 0
        for item in NIdToDistH:
            sum_length += NIdToDistH[item]
        tmp =  sum_length / (len(NIdToDistH)-1)
    avgs.append(tmp)
    max_len = max(max_len, tmp)

result.at[subreddit, 'Number of Authors'] = len(nodes)
result.at[subreddit,'Nonzero'] = countPath
result.at[subreddit,'% Existence'] = countPath/len(nodes)**2
result.at[subreddit,'average'] = sum(avgs)/len(nodes)
result.at[subreddit,'median'] = avgs[len(avgs)//2]

-- START ANALYZING


100%|████████████████████████████████| 148422/148422 [14:53:54<00:00,  2.77it/s]


In [23]:
display(result)
a = pd.DataFrame(data = {'author':nodes, 'avg per author': avgs})
display(a.head())

print(max_len)

Unnamed: 0,Number of Authors,Nonzero,% Existence,average,median
PS5 - full,322476.0,103665300000.0,0.99687,3.451716,3.491083
pokemontrades - full,,,,,
EscapefromTarkov - full,,,,,
PS5 - 10000,10839.0,35685560.0,0.303748,3.564574,6.663295
PS5 - 50000,34425.0,830961000.0,0.701185,4.055531,5.30433
(snap) PS5 - 10000,10839.0,35685560.0,0.303748,3.564574,9.478597
(snap) PS5 - 50000,34425.0,830961000.0,0.701185,4.055531,5.393706
PS5 - filtered,148422.0,22017810000.0,0.999488,3.00237,2.635032


Unnamed: 0,author,avg per author
0,3011,1.747026
1,15519,1.925099
2,13745,1.942487
3,17731,2.042539
4,248285,2.080218


5.341353119966573


In [25]:
result.to_csv('full/result_full_filtered_snap.csv')
a.to_csv('full/PS5_filtered_author_avg.csv')
#a = pd.read_csv('full/PS5_filtered_author_avg.csv', index_col = 0)


# Analyze Active Users

In [None]:
# choose i% of data to compare
# method 1: completely random
# method 2: randomly choose subgraph after filtering top25% active users

In [115]:
importance_of_active = pd.DataFrame()

In [117]:
sampleSize = [i/100 for i in range(10,31,5)]

for size in sampleSize:
    nodes_amount = int(len(nodes) * size)
    random_nodes = random.sample(nodes, nodes_amount)
    random_without_most_nodes = random.sample(nodes[int(len(nodes) * 0.25):], nodes_amount)

    G_random = G1.GetSubGraph(random_nodes)
    G_remove_most = G1.GetSubGraph(random_without_most_nodes)

    # ======= METHOD 1: RANDOM CHOOSE NODES ============
    countPath = 0
    avgs = 0
    max_len = 0

    index = str(size)

    for i in tqdm.tqdm(random_nodes):
        shortestPath, NIdToDistH = G_random.GetShortPathAll(i)
        countPath += len(NIdToDistH)
        if len(NIdToDistH)== 1:
            tmp = 0
        else:
            sum_length = 0
            for item in NIdToDistH:
                sum_length += NIdToDistH[item]
            tmp =  sum_length / (len(NIdToDistH)-1)
        avgs = avgs+tmp

    importance_of_active.at[index,'random %'] = countPath/len(random_nodes)**2
    importance_of_active.at[index,'random avg'] = avgs/len(random_nodes)


    # ======= ANALYZE AFTER REMOVE MOST ACTIVE USERS ======
    countPath = 0
    avgs = 0
    max_len = 0

    index = str(size)

    for i in random_without_most_nodes:
        shortestPath, NIdToDistH = G_remove_most.GetShortPathAll(i)
        countPath += len(NIdToDistH)
        if len(NIdToDistH)== 1:
            tmp = 0
        else:
            sum_length = 0
            for item in NIdToDistH:
                sum_length += NIdToDistH[item]
            tmp =  sum_length / (len(NIdToDistH)-1)
        avgs = avgs+tmp

    importance_of_active.at[index,'remove most %'] = countPath/len(random_without_most_nodes)**2
    importance_of_active.at[index,'remove most avg'] = avgs/len(random_without_most_nodes)

100%|████████████████████████████████████| 14842/14842 [01:01<00:00, 242.57it/s]
100%|█████████████████████████████████████| 22263/22263 [05:18<00:00, 70.01it/s]
100%|█████████████████████████████████████| 29684/29684 [12:18<00:00, 40.22it/s]
100%|█████████████████████████████████████| 37105/37105 [16:42<00:00, 36.99it/s]
100%|█████████████████████████████████████| 44526/44526 [37:44<00:00, 19.66it/s]


In [118]:
importance_of_active

Unnamed: 0,random %,random avg,remove most %,remove most avg
0.1,0.167853,1.751088,7e-05,0.029511
0.15,0.353323,2.134793,4.7e-05,0.042747
0.2,0.443139,2.23428,3.6e-05,0.058325
0.25,0.384608,2.458569,3e-05,0.073589
0.3,0.573286,2.737417,2.6e-05,0.094055


In [119]:
importance_of_active.to_csv('full/importance_of_active.csv')

In [120]:
sampleSize = [i/100 for i in range(10,31,5)]

for size in sampleSize:
    nodes_amount = int(len(nodes) * size)
    random_without_1_nodes = random.sample(nodes[int(len(nodes) * 0.01):], nodes_amount)
    random_without_5_nodes = random.sample(nodes[int(len(nodes) * 0.05):], nodes_amount)
    random_without_10_nodes = random.sample(nodes[int(len(nodes) * 0.10):], nodes_amount)

    G_remove_1 = G1.GetSubGraph(random_without_1_nodes)
    G_remove_5 = G1.GetSubGraph(random_without_5_nodes)
    G_remove_10 = G1.GetSubGraph(random_without_10_nodes)

    # ======= remove 1% most active ============
    countPath = 0
    avgs = 0
    max_len = 0

    index = str(size)

    for i in tqdm.tqdm(random_without_1_nodes):
        shortestPath, NIdToDistH = G_remove_1.GetShortPathAll(i)
        countPath += len(NIdToDistH)
        if len(NIdToDistH)== 1:
            tmp = 0
        else:
            sum_length = 0
            for item in NIdToDistH:
                sum_length += NIdToDistH[item]
            tmp =  sum_length / (len(NIdToDistH)-1)
        avgs = avgs+tmp

    importance_of_active.at[index,'remove 1 %'] = countPath/len(random_without_1_nodes)**2
    importance_of_active.at[index,'remove 1 avg'] = avgs/len(random_without_1_nodes)


    # ======= ANALYZE AFTER REMOVE MOST ACTIVE USERS ======
    countPath = 0
    avgs = 0
    max_len = 0

    for i in tqdm.tqdm(random_without_5_nodes):
        shortestPath, NIdToDistH = G_remove_5.GetShortPathAll(i)
        countPath += len(NIdToDistH)
        if len(NIdToDistH)== 1:
            tmp = 0
        else:
            sum_length = 0
            for item in NIdToDistH:
                sum_length += NIdToDistH[item]
            tmp =  sum_length / (len(NIdToDistH)-1)
        avgs = avgs+tmp

    importance_of_active.at[index,'remove 5 %'] = countPath/len(random_without_5_nodes)**2
    importance_of_active.at[index,'remove 5 avg'] = avgs/len(random_without_5_nodes)
    
    
    # ======= REMOVE TOP 10 % active users =======
    countPath = 0
    avgs = 0
    max_len = 0

    for i in tqdm.tqdm(random_without_10_nodes):
        shortestPath, NIdToDistH = G_remove_10.GetShortPathAll(i)
        countPath += len(NIdToDistH)
        if len(NIdToDistH)== 1:
            tmp = 0
        else:
            sum_length = 0
            for item in NIdToDistH:
                sum_length += NIdToDistH[item]
            tmp =  sum_length / (len(NIdToDistH)-1)
        avgs = avgs+tmp

    importance_of_active.at[index,'remove 10 %'] = countPath/len(random_without_10_nodes)**2
    importance_of_active.at[index,'remove 10 avg'] = avgs/len(random_without_10_nodes)

100%|████████████████████████████████████| 14842/14842 [00:21<00:00, 704.38it/s]
100%|██████████████████████████████████| 14842/14842 [00:00<00:00, 48431.35it/s]
100%|██████████████████████████████████| 14842/14842 [00:00<00:00, 58584.20it/s]
100%|████████████████████████████████████| 22263/22263 [01:23<00:00, 266.90it/s]
100%|███████████████████████████████████| 22263/22263 [00:07<00:00, 3151.66it/s]
100%|██████████████████████████████████| 22263/22263 [00:00<00:00, 48202.58it/s]
100%|████████████████████████████████████| 29684/29684 [03:34<00:00, 138.11it/s]
100%|████████████████████████████████████| 29684/29684 [00:49<00:00, 604.47it/s]
100%|██████████████████████████████████| 29684/29684 [00:00<00:00, 32313.26it/s]
100%|█████████████████████████████████████| 37105/37105 [07:21<00:00, 84.07it/s]
100%|████████████████████████████████████| 37105/37105 [02:03<00:00, 301.44it/s]
100%|███████████████████████████████████| 37105/37105 [00:05<00:00, 7193.80it/s]
100%|███████████████████████

In [121]:
display(importance_of_active)

Unnamed: 0,random %,random avg,remove most %,remove most avg,remove 1 %,remove 1 avg,remove 5 %,remove 5 avg,remove 10 %,remove 10 avg
0.1,0.167853,1.751088,7e-05,0.029511,0.059149,1.558856,0.0002,0.403663,8.3e-05,0.132417
0.15,0.353323,2.134793,4.7e-05,0.042747,0.104973,1.890278,0.00877,1.812365,6.8e-05,0.21428
0.2,0.443139,2.23428,3.6e-05,0.058325,0.14635,2.101646,0.033657,2.499994,8.8e-05,0.366668
0.25,0.384608,2.458569,3e-05,0.073589,0.183109,2.261558,0.052181,2.50663,0.00177,1.3622
0.3,0.573286,2.737417,2.6e-05,0.094055,0.227715,2.447756,0.074577,2.631815,0.005715,1.877768


In [None]:
sampleSize = [1]

for size in sampleSize:
    nodes_amount = int(len(nodes) * size)
    random_without_1_nodes = random.sample(nodes[int(len(nodes) * 0.01):], nodes_amount)

    G_remove_1 = G1.GetSubGraph(random_without_1_nodes)

    # ======= remove 1% most active ============
    countPath = 0
    avgs = 0
    max_len = 0

    index = str(size)

    for i in tqdm.tqdm(random_without_1_nodes):
        shortestPath, NIdToDistH = G_remove_1.GetShortPathAll(i)
        countPath += len(NIdToDistH)
        if len(NIdToDistH)== 1:
            tmp = 0
        else:
            sum_length = 0
            for item in NIdToDistH:
                sum_length += NIdToDistH[item]
            tmp =  sum_length / (len(NIdToDistH)-1)
        avgs = avgs+tmp

    importance_of_active.at[index,'remove 1 %'] = countPath/len(random_without_1_nodes)**2
    importance_of_active.at[index,'remove 1 avg'] = avgs/len(random_without_1_nodes)


In [136]:
nodes = list(nodes)
random_without_1_nodes = list(nodes[int(len(nodes) * 0.01):])

G_remove_1 = G1.GetSubGraph(random_without_1_nodes)
countPath = 0
avgs = 0
max_len = 0

subreddit = 'PS5 - remove top 1%'

nodes = sorted_nodes

for i in tqdm.tqdm(random_without_1_nodes):
    shortestPath, NIdToDistH = G_remove_1.GetShortPathAll(i)
    countPath += len(NIdToDistH)
    if len(NIdToDistH)== 1:
        tmp = 0
    else:
        sum_length = 0
        for item in NIdToDistH:
            sum_length += NIdToDistH[item]
        tmp =  sum_length / (len(NIdToDistH)-1)
    avgs = avgs + tmp

result.at[subreddit, 'Number of Authors'] = len(random_without_1_nodes)
result.at[subreddit,'Nonzero'] = countPath
result.at[subreddit,'% Existence'] = countPath/len(random_without_1_nodes)**2
result.at[subreddit,'average'] = avgs/len(random_without_1_nodes)

100%|█████████████████████████████████| 146938/146938 [6:53:35<00:00,  5.92it/s]


In [132]:
importance_of_active.to_csv('full/importance_of_active.csv')

In [137]:
result.to_csv('full/result.csv')

In [138]:
result

Unnamed: 0,Number of Authors,Nonzero,% Existence,average,median
PS5 - full,322476.0,103665300000.0,0.99687,3.451716,3.491083
pokemontrades - full,,,,,
EscapefromTarkov - full,,,,,
PS5 - 10000,10839.0,35685560.0,0.303748,3.564574,6.663295
PS5 - 50000,34425.0,830961000.0,0.701185,4.055531,5.30433
(snap) PS5 - 10000,10839.0,35685560.0,0.303748,3.564574,9.478597
(snap) PS5 - 50000,34425.0,830961000.0,0.701185,4.055531,5.393706
PS5 - filtered,148422.0,22017810000.0,0.999488,3.00237,2.635032
PS5 - remove top 1%,146938.0,11882780000.0,0.550364,3.220571,
