In [3]:
import numpy as np
from scipy.optimize import curve_fit
from mylib.mysql_tool import execute_sql
from math import e

## 实验1 获取所有repo

In [4]:
import pandas as pd
repos = execute_sql("select * from repo",False)
df_repo = pd.DataFrame(repos,columns=['repo_id','repo_name'])
df_repo.index=df_repo['repo_id'] # 将repo_id列作为DataFrame的行索引
print("repos shape:",df_repo.shape)
df_repo.to_csv("repos.csv")
df_repo.head(5)

repos shape: (1363, 2)


Unnamed: 0_level_0,repo_id,repo_name
repo_id,Unnamed: 1_level_1,Unnamed: 2_level_1
100075549,100075549,Azure/autorest.aio
100118847,100118847,Azure/azure-docs-json-samples
100314903,100314903,Azure/autorest.common
100315083,100315083,Azure/autorest.modeler
100315457,100315457,Azure/autorest.azureresourceschema


## 实验2 获取所有的edge数据

In [5]:
edges = execute_sql("select * from edge",False)
df_edges = pd.DataFrame(edges,columns=['repo_i','repo_j','weight','date'])
print("edge shape:",df_edges.shape)
df_edges.to_csv("edges.csv")
df_edges.head(5)

edge shape: (449816, 4)


Unnamed: 0,repo_i,repo_j,weight,date
0,2928944,2928948,42.0,2017-01-01
1,2928944,2950981,42.0,2017-01-01
2,2928944,4127095,42.0,2017-01-01
3,2928944,6506651,42.0,2017-01-01
4,2928944,13808889,42.0,2017-01-01


## 实验3 获取想要研究的repo在全时间段的所有权重情况

### 步骤1 获取不同的repo_i列表

In [6]:
repo_i_unique=df_edges.repo_i.unique()
print("repo_i_unique shape:",repo_i_unique.shape)
print(repo_i_unique[:5])

repo_i_unique shape: (1433,)
['2928944' '2928948' '2950981' '4127088' '4127095']


### 步骤2 获取这些repo的名字

In [7]:
df_repo_unique=df_repo.loc[df_repo['repo_id'].isin(repo_i_unique )] # some_values是可迭代对象
df_repo_unique.head(5)

Unnamed: 0_level_0,repo_id,repo_name
repo_id,Unnamed: 1_level_1,Unnamed: 2_level_1
100075549,100075549,Azure/autorest.aio
100118847,100118847,Azure/azure-docs-json-samples
100314903,100314903,Azure/autorest.common
100315083,100315083,Azure/autorest.modeler
100315457,100315457,Azure/autorest.azureresourceschema


### 步骤3 查看某个仓库，在某个月的weight(包含出和入)

In [8]:
repo_id='100315083'
date='2019-11-01'
temp=df_edges[((df_edges.repo_j==repo_id) | (df_edges.repo_i==repo_id))&(df_edges.date==date) ]
#temp.head(5)
temp

Unnamed: 0,repo_i,repo_j,weight,date
197585,24960549,100315083,20.0,2019-11-01
197622,30773898,100315083,4.0,2019-11-01
197709,34483694,100315083,4.0,2019-11-01
197901,44776068,100315083,4.0,2019-11-01
198076,47575905,100315083,4.0,2019-11-01
...,...,...,...,...
202358,165910472,100315083,16.0,2019-11-01
202461,167625526,100315083,4.0,2019-11-01
202505,173348572,100315083,4.0,2019-11-01
202588,176383947,100315083,4.0,2019-11-01


### 步骤4 制作一个表格，包含了所有的要统计的仓库在2019年的weight

In [9]:
df_repo_date_weight=df_repo_unique.copy()
df_repo_date_weight=df_repo_date_weight.copy()
months_not_zeroArray = np.zeros(df_repo_date_weight.shape[0])
#df_repo_date_weight['months_zero']=0
df_repo_date_weight.head(5)

Unnamed: 0_level_0,repo_id,repo_name
repo_id,Unnamed: 1_level_1,Unnamed: 2_level_1
100075549,100075549,Azure/autorest.aio
100118847,100118847,Azure/azure-docs-json-samples
100314903,100314903,Azure/autorest.common
100315083,100315083,Azure/autorest.modeler
100315457,100315457,Azure/autorest.azureresourceschema


该函数，将数据从mysql取出，将度按时间累加

In [10]:
def get_repo_date_weight(date_list):
    #df_repo_date_weight表格增加n列，用来放置每个月的度
    for date in date_list:
        df_repo_date_weight[date]="000000"
    #df_repo_date_weight表格增加1列，用来放置多少个月度没有发生变化
    df_repo_date_weight['months_not_zero']=0
    for repo_id in repo_i_unique:
        months_not_zero=0
        #每个repo这个月都要累积上个月的weight
        sum_weight=0
        try:
            for date in date_list:
                #该仓库在某个月内，和其它所有仓库之间的weights
                weights=df_edges[((df_edges.repo_j==repo_id) | (df_edges.repo_i==repo_id))&(df_edges.date==date) ].weight
                if(weights.sum()>0):
                    months_not_zero+=1
                sum_weight+=weights.sum()
                df_repo_date_weight.loc[repo_id,date]=sum_weight
            df_repo_date_weight.loc[repo_id,'months_not_zero']=months_not_zero 
            print(df_repo_date_weight) 
        except: 
            print("err!")
    print(df_repo_date_weight)  

In [11]:
date_list = ['2017-01-01', '2017-02-01', '2017-03-01', '2017-04-01', '2017-05-01', '2017-06-01',
             '2017-07-01', '2017-08-01', '2017-09-01', '2017-10-01', '2017-11-01', '2017-12-01',
             '2018-01-01', '2018-02-01', '2018-03-01', '2018-04-01', '2018-05-01', '2018-06-01',
             '2018-07-01', '2018-08-01', '2018-09-01', '2018-10-01', '2018-11-01', '2018-12-01',
             '2019-01-01', '2019-02-01', '2019-03-01', '2019-04-01', '2019-05-01', '2019-06-01',
             '2019-07-01', '2019-08-01', '2019-09-01', '2019-10-01', '2019-11-01', '2019-12-01',
             '2020-01-01', '2020-02-01', '2020-03-01', '2020-04-01', '2020-05-01', '2020-06-01',
             '2020-07-01', '2020-08-01', '2020-09-01', '2020-10-01', '2020-11-01', '2020-12-01',
             '2021-01-01', '2021-02-01', '2021-03-01', '2021-04-01', '2021-05-01', '2021-06-01',
             '2021-07-01', '2021-08-01', '2021-09-01', '2021-10-01', '2021-11-01', '2021-12-01'
             ]
get_repo_date_weight(date_list)
df_repo_date_weight.to_csv("repo_date_weight.csv")

             repo_id                                   repo_name 2017-01-01  \
repo_id                                                                       
100075549  100075549                          Azure/autorest.aio     000000   
100118847  100118847               Azure/azure-docs-json-samples     000000   
100314903  100314903                       Azure/autorest.common     000000   
100315083  100315083                      Azure/autorest.modeler     000000   
100315457  100315457          Azure/autorest.azureresourceschema     000000   
...              ...                                         ...        ...   
99381342    99381342  Azure/service-fabric-scripts-and-templates     000000   
99610639    99610639                                Azure/simdem     000000   
99669332    99669332         Azure/iot-edge-opc-proxy-api-csharp     000000   
99830241    99830241                                Azure/kashti     000000   
99879997    99879997                        Azure/co

### 步骤5：观察网络随时间变化指标