In [14]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import numpy as np
import os
import sys
import gc
from datetime import datetime
import scipy.optimize as opt
from matplotlib import pyplot as plt

In [4]:
os.chdir("drive")
os.chdir("My Drive")
os.chdir("Colab Notebooks")
os.chdir("new_upload_Aditya")
os.listdir()

['followers.csv',
 'users.csv',
 'user_follower.csv',
 'df_trim.csv',
 'in_deg_freq.csv',
 'out_deg_freq.csv',
 'all_in_deg.csv',
 'all_out_deg.csv',
 'degreedistribution.png',
 'networkgrowth.png',
 'plot-network-growth.ipynb',
 'Degree_Distribution.ipynb',
 'UserFol.ipynb']

# Data Loading 

Reading user file

In [5]:
user = pd.read_csv('users.csv',  encoding='utf-8', error_bad_lines=False, warn_bad_lines=False, header=None, parse_dates=[4])
user.columns=['user_id', 'login', 'company', 'user_created_at', 'type','fake','deleted', 'long', 'lat', 'country_code','state', 'city', 'location']
user.head(5)

Unnamed: 0,user_id,login,company,user_created_at,type,fake,deleted,long,lat,country_code,state,city,location
0,-1,0xnoone,\N,2016-04-18 11:42:46,USR,0,1.0,\N,\N,\N,\N,\N,\N
1,1,tosch,Sage GmbH,2008-12-15 12:28:33,USR,0,0.0,0.00000000,0.00000000,\N,\N,\N,"Rastede, Germany"
2,2,jmettraux,\N,2008-03-22 00:37:42,USR,0,0.0,132.45529270,34.38520290,jp,Hiroshima Prefecture,Hiroshima,Hiroshima
3,3,SMGNMSKD,\N,2012-08-03 16:08:15,USR,1,1.0,\N,\N,\N,\N,\N,\N
4,4,kennethkalmer,@ValuationUp,2008-04-28 17:25:53,USR,0,0.0,28.04730510,-26.20410280,za,City of Johannesburg Metropolitan Municipality,Johannesburg,"Johannesburg, South Africa"


Reading follower file

In [6]:
follower_df = pd.read_csv('followers.csv', encoding='utf-8', header=None, error_bad_lines=False, warn_bad_lines=False)
follower_df.columns = ['follower_id', 'user_id', 'link_created_at']
follower_df.head(5)

Unnamed: 0,follower_id,user_id,link_created_at
0,1,2,2008-12-15 12:28:33
1,1,4,2008-12-15 12:28:33
2,1,17896,2008-12-15 12:28:33
3,1,21523,2008-12-15 12:28:33
4,1,29121,2008-12-15 12:28:33


# Data preprocessing

Following are the 3 constraints which are taken into account during pre-processing phase.

1.   Account type should be 'USR', not organization or any other. 
2.   Consider user accounts which are not deleted
3.   Neglect the fake users



### Data Limitation

Note: It is very important to apply all the 3 constraints on the data because the timestamp (user_created_at) contains invalid datetime format for many rows. which is one of the limitation of this dataset. 

Ex: ",2012-05-01 17:04:57" in this datetime string the character "," is invalid. This limitation can be overcome by applying the 3 constraints mentioned above. 

Uncomment the code below to verify the limitation of the data .

All these information has been explained on https://ghtorrent.org/relational.html in detail.

In [0]:
# user.loc[user['user_created_at'] == ",2012-05-01 17:04:57\""].index

In [0]:
# user = user.drop(index=[6658410, 6658411, 6658412, 6658413, 6658414, 6658415, 6658416])

In [0]:
# user.loc[user['user_created_at'] == ",2012-05-01 17:04:57\""].index

In [0]:
len(user)

29542941

### Preprocessing

In [7]:
user_df = pd.DataFrame()
user_df = user.loc[(user['type'] == 'USR') & (user['deleted'] == 0.0) & (user['fake'] == 0)]

print("Length of user dataframe after removing users not to be considered",len(user_df['user_id']))
#print("Number of users not deleted", user_df['deleted'].value_counts()[0])
user_df.head(5)

Length of user dataframe after removing users not to be considered 20497145


Unnamed: 0,user_id,login,company,user_created_at,type,fake,deleted,long,lat,country_code,state,city,location
1,1,tosch,Sage GmbH,2008-12-15 12:28:33,USR,0,0.0,0.0,0.0,\N,\N,\N,"Rastede, Germany"
2,2,jmettraux,\N,2008-03-22 00:37:42,USR,0,0.0,132.4552927,34.3852029,jp,Hiroshima Prefecture,Hiroshima,Hiroshima
4,4,kennethkalmer,@ValuationUp,2008-04-28 17:25:53,USR,0,0.0,28.0473051,-26.2041028,za,City of Johannesburg Metropolitan Municipality,Johannesburg,"Johannesburg, South Africa"
5,5,weppos,DNSimple,2008-04-06 08:44:35,USR,0,0.0,12.4963655,41.9027835,it,Rome,Rome,"Rome, Italy"
6,6,anb,KeyLemon,2010-02-05 06:35:04,USR,0,0.0,0.0,0.0,\N,\N,\N,\N


In [0]:
user_follower_df = pd.DataFrame()
user_follower_df['user_id'] = follower_df['user_id']
user_follower_df['follower_id'] = follower_df['follower_id']
user_follower_df['link_created_at'] = follower_df['link_created_at']

In [0]:
mergedStuff = pd.merge(user_df, user_follower_df, on=['user_id'], how='inner')

In [10]:
mergedStuff.loc[mergedStuff['user_id']==376498].count()

user_id            41660
login              41660
company            41660
user_created_at    41660
type               41660
fake               41660
deleted            41660
long               41660
lat                41660
country_code       41660
state              41660
city               41660
location           41660
follower_id        41660
link_created_at    41660
dtype: int64

In [0]:
#mergedStuff[['user_id','follower_id','link_created_at', 'user_created_at']].head()

Unnamed: 0,user_id,follower_id,link_created_at,user_created_at
0,1,2,2008-12-15 12:28:33,2008-12-15 12:28:33
1,1,4,2008-12-15 12:28:33,2008-12-15 12:28:33
2,1,6,2010-02-05 06:35:04,2008-12-15 12:28:33
3,1,2695,2009-12-10 09:30:56,2008-12-15 12:28:33
4,1,2711,2008-12-15 12:28:33,2008-12-15 12:28:33


In [11]:
# Discarding rows with same user & link creation timestamp
#cleaned_merged_df = mergedStuff.loc[mergedStuff['link_created_at']!= mergedStuff['user_created_at']]
del cleaned_merged_df
gc.collect()

NameError: ignored

In [0]:
#cleaned_merged_df[['user_id','follower_id','link_created_at', 'user_created_at']].head()

Unnamed: 0,user_id,follower_id,link_created_at,user_created_at
2,1,6,2010-02-05 06:35:04,2008-12-15 12:28:33
3,1,2695,2009-12-10 09:30:56,2008-12-15 12:28:33
7,1,47153,2009-04-29 06:49:00,2008-12-15 12:28:33
8,1,59446,2012-08-21 10:27:25,2008-12-15 12:28:33
9,1,78637,2009-04-17 05:47:37,2008-12-15 12:28:33


In [0]:
# Data limitation with respect to link_created_at 
# cleaned_merged_df.loc[cleaned_merged_df['login']=='torvalds']
# follower_df.loc[follower_df['user_id']==5203].count

In [0]:
################-------------------#################
# Use only for testing the data integrity
# grouped_follower = mergedStuff.groupby(['user_id'], sort=True).agg('count')
# temp = grouped_follower.sort_values('follower_id', ascending=False)


In [0]:
mergedStuff[['user_id','follower_id','link_created_at', 'user_created_at']].to_csv('user_follower.csv', index=False)

In [0]:
#del user_df, follower_df
del mergedStuff
gc.collect()

# Plotting 

Reading data from user-follower file

In [0]:
df = pd.read_csv('user_follower.csv', encoding='utf-8', parse_dates=False)  #pd.read_csv(filename, sep=' ', parse_dates=[2, 3])

In [13]:
df.head()

Unnamed: 0,user_id,follower_id,link_created_at,user_created_at
0,1,2,2008-12-15 12:28:33,2008-12-15 12:28:33
1,1,4,2008-12-15 12:28:33,2008-12-15 12:28:33
2,1,6,2010-02-05 06:35:04,2008-12-15 12:28:33
3,1,2695,2009-12-10 09:30:56,2008-12-15 12:28:33
4,1,2711,2008-12-15 12:28:33,2008-12-15 12:28:33


In [0]:
# Finding the timestamp difference for each link
df['ts_diff'] = (pd.to_datetime(df['link_created_at']) - pd.to_datetime(df['user_created_at'])).astype('timedelta64[M]')

In [15]:
df.shape

(29051105, 5)

In [0]:
# negative_diff_list = df[df['ts_diff']<0]['user_id'].unique()
# df = df[~df['user_id'].isin(negative_diff_list)]

df = df[ df['ts_diff'] >= 0 ]

In [17]:
df.shape

(29021515, 5)

Difference between the timestamp (link_created_at , user_created_at) is less than 0 for some rows. Therefore neglecting all the negative results before processing.

Number of links before removing links which have timestamp difference less than 0 is : 29051105

Number of links after removing links which have timestamp difference less than 0 is : 29021515

Total 29590 are dropped

In [0]:
# Check if there are no rows left with timestamp difference less than 0
df[df['ts_diff']<0]

Unnamed: 0,user_id,follower_id,link_created_at,user_created_at,ts_diff


In [20]:
print(min(df['user_created_at']))
print(max(df['user_created_at']))

2007-10-20 03:24:19
2018-09-29 07:28:44


In [21]:
print(min(df['link_created_at']))
print(max(df['link_created_at']))

2007-10-20 03:24:19
2018-09-29 17:51:24


In [0]:
def plot_growth(node_id):
    tt = np.sort(grp.get_group(node_id)['ts_diff'].values).astype(int)
    #tt -= np.min(tt)

    addup = np.append(np.bincount(tt), [0] * (maxday - len(np.bincount(tt))))
    #numerator[0:maxday] += np.cumsum(addup) / np.sum(addup)
    numerator[0:maxday] += np.cumsum(addup)


In [0]:
# group by users to get user creation count
grp = df.groupby('user_id')

In [0]:
# in-degree (# followers)
degrees = grp.user_created_at.count().values
t0 = grp.user_created_at.min().values

In [0]:
# Converting t0 to datetime datatype
t0 = t0.astype('datetime64',copy=False)

In [0]:
# levels
degree_thres = 0
data_levels = {}
data_levels['A'] = np.where(np.logical_and(degrees > degree_thres, np.logical_and(np.datetime64('2007-01-01') <= t0, t0 < np.datetime64('2010-01-01'))))
data_levels['B'] = np.where(np.logical_and(degrees > degree_thres, np.logical_and(np.datetime64('2010-01-01') <= t0, t0 < np.datetime64('2013-01-01'))))
data_levels['C'] = np.where(np.logical_and(degrees > degree_thres, np.logical_and(np.datetime64('2013-01-01') <= t0, t0 < np.datetime64('2016-01-01'))))

In [30]:
maxday = np.max(df['ts_diff'].values).astype(int) + 1

index = grp.first().index.values
fig = plt.figure(figsize = (16, 9.9))#(5.5, 4))

levels = ['A', 'B', 'C']
colors = ['C0', 'C1', 'C2']
labels = ['Years 2007-2010', 'Years 2010-2013', 'Years 2013-2016']
patterns = [None, [6, 6], [3, 3], [1, 3]]
markers = ['+', '*', 'o']
cutoff = [12, 9, 6]

<Figure size 1152x712.8 with 0 Axes>

In [1]:
maxday = np.max(df['ts_diff'].values).astype(int) + 1

index = grp.first().index.values
fig = plt.figure(figsize = (5.5, 4))#(5.5, 4))

levels = ['A', 'B', 'C']
colors = ['C0', 'C1', 'C2']
labels = ['Years 2007-2010', 'Years 2010-2013', 'Years 2013-2016']
patterns = [None, [6, 6], [3, 3], [1, 3]]
markers = ['+', '*', 'o']
cutoff = [12, 9, 6]

for l in range(len(levels)):
    level = levels[l]

    numerator = np.zeros(maxday)
    denominator = np.zeros(maxday)

    n = len(data_levels[level][0])
    count = 0

    for i in data_levels[level][0]:
#        if np.random.rand() < 1200 / n:
         plot_growth(index[i])
         count += 1

    #print (y[10])
    x = (np.arange(maxday)) / 12
    y = numerator / count
    print (y[10])

    this_line = plt.plot(x[:12*cutoff[l]], y[:12*cutoff[l]], fillstyle='none', color=colors[l], label=labels[l],  marker=markers[l])[0]
    if patterns[l] != None: this_line.set_dashes(patterns[l])


#plt.savefig('aps-inset.eps', format='eps', bbox_inches='tight')
# plt.figure(figsize =(16, 9.9))
plt.xlabel(r'Years After Joining Github', fontsize=12)
#plt.xlim([1. / 12, 55])
plt.ylabel(r'Average number of followers', fontsize=12)
plt.xscale('log')
plt.yscale('log')
plt.yticks()
#plt.ylim([0.08, 20])
plt.legend(loc='upper left', handlelength = 2.7)
# plt.title('Average number of followers of  users on GitHub')
plt.text(-0.13, 1, ' ',transform = plt.gca().transAxes, ha = 'center', va = 'center', fontsize = 17)
plt.show()
#fig.savefig('averagefollowers.png', bbox_inches = 'tight')
fig.savefig('avfoll.pdf', bbox_inches = 'tight')
#plt.savefig('aps-inset.pdf', bbox_inches = 'tight')

NameError: ignored

In [0]:
import gc
gc.collect()

109

Writing the graph into pdf file