In [1]:
import plotly.plotly as py
import pandas as pd

# Location of the country-name -> country-code converter table.
_COUNTRY2CODE_PATH = 'reference/country2code.csv'
# Filled on first use so the CSV is parsed once instead of once per lookup.
_country2code_cache = {}


def _load_country2code():
    """Return the country-name -> country-code dict, reading the CSV on first use only."""
    if not _country2code_cache:
        table = pd.read_csv(_COUNTRY2CODE_PATH, names=['country', 'country_code'])
        _country2code_cache.update(
            table.set_index('country')['country_code'].to_dict())
    return _country2code_cache


def country2code(t):
    """Convert a location string to the country code of its last component.

    Parameters
    ----------
    t : str
        Location text. The part after the last ``', '`` is used as the
        country name (the whole string when there is no ``', '``).

    Returns
    -------
    The ``country_code`` value stored for that country in
    ``reference/country2code.csv``.

    Raises
    ------
    KeyError
        If the country name is not present in the converter table.
    AttributeError
        If ``t`` is not a string (for example a NaN from a missing cell).

    Notes
    -----
    The converter table is cached after the first call; restart the kernel
    (or clear ``_country2code_cache``) to pick up edits to the CSV.
    """
    country = t.split(', ')[-1]
    return _load_country2code()[country]

def paint_world_map(title, locations, z, text):
    """Build a world choropleth figure and hand it to ``py.iplot``.

    Parameters
    ----------
    title : str
        Figure title.
    locations : list
        Country codes, one per region (``country2code`` can produce them
        from country names).
    z : list
        The value to colour each region by.
    text : list
        Country names; callers pass columns of the same DataFrame so the
        three lists line up element by element.

    Returns
    -------
    Whatever ``py.iplot`` returns for the figure.
    """
    # Eleven-stop colour ramp; it is used with reversescale=True below.
    shade_stops = [
        [0, "rgb(0, 30, 155)"],
        [0.1, "rgb(15, 45, 165)"],
        [0.2, "rgb(30, 60, 175)"],
        [0.3, "rgb(45, 75, 185)"],
        [0.4, "rgb(60, 90, 195)"],
        [0.5, "rgb(75, 105, 205)"],
        [0.6, "rgb(90, 120, 215)"],
        [0.7, "rgb(115, 145, 225)"],
        [0.8, "rgb(140, 170, 235)"],
        [0.9, "rgb(165, 195, 245)"],
        [1, "rgb(255, 255, 255)"],
    ]

    # Thin grey lines for the country borders.
    border_style = dict(line=dict(color='rgb(180,180,180)', width=0.5))
    legend_bar = dict(autotick=True, title='amount')

    trace = dict(
        type='choropleth',
        locations=locations,
        z=z,
        text=text,
        colorscale=shade_stops,
        autocolorscale=False,  # use the hand-picked ramp instead of the automatic one
        reversescale=True,  # chosen to go with the ramp defined above
        marker=border_style,
        colorbar=legend_bar,
    )

    geo_options = dict(
        showframe=False,  # no outer frame around the map
        showcoastlines=False,  # no coastlines
        projection=dict(type='Mercator'),
    )
    page_layout = dict(title=title, geo=geo_options)

    figure = dict(data=[trace], layout=page_layout)
    return py.iplot(figure, validate=False, filename='d3-world-map')

# Complete code/country reference table. Column names are supplied
# explicitly, so the first row of the file is read as data.
df_c2c = pd.read_csv('reference/code&country.csv',
                     names=['country_code', 'country'])

# Journalist records including their GitHub contribution columns.
# NOTE(review): the original comment called this "geopy-raw.csv", the raw
# address data exported by a previous cell -- confirm which file is intended.
df_gh = pd.read_csv(
    'reference/journalists-data-including-github-contributions.csv')

# Working frame restricted to the descriptive columns.
df_0d = df_gh.reindex(columns=['name', 'institution', 'city', 'github'])

# Translate every journalist's location into a country code.
df_0d['country_code'] = df_gh['city'].apply(country2code)
df_code_country_ratio = pd.DataFrame()

In [2]:
# Total number of data journalists per country since 2008.
# The counts Series is named explicitly: the name value_counts() gives its
# result differs between pandas versions (source column name vs. 'count'),
# so renaming the column afterwards by its old name is not reliable.
df_code_amount = (
    df_0d['country_code']
    .value_counts()
    .rename('journalist_sum')
    .to_frame()
)

# Attach the counts to the full country table; countries without any
# journalist get 0 instead of NaN.
df_code_country_data_j = df_c2c.join(df_code_amount, on='country_code').fillna(0)

locations = list(df_code_country_data_j['country_code'])
text = list(df_code_country_data_j['country'])
z = list(df_code_country_data_j['journalist_sum'])
title = 'Global Data Journalist Distribution<br>Source:\
                <a href="http://jplusplus.github.io/global-directory/">\
                Global Data Journalist Directory</a>'

paint_world_map(title,locations,z,text)

In [4]:
# Ten countries with the highest journalist count.
df_code_country_data_j.sort_values('journalist_sum', ascending=False).head(10)

Unnamed: 0,country_code,country,journalist_sum
211,USA,United States,46.0
75,DEU,Germany,23.0
70,FRA,France,22.0
99,ITA,Italy,19.0
187,ESP,Spain,14.0
142,NLD,Netherlands,11.0
193,CHE,Switzerland,8.0
210,GBR,United Kingdom,6.0
37,CAN,Canada,6.0
27,BRA,Brazil,5.0


In [5]:
# Total GitHub contributions per country since 2008.
import numpy as np  # kept so any later cell that relies on `np` still runs

# Per-journalist total: every column from position 4 onward is summed.
# NOTE(review): presumably those are the contribution columns of the CSV --
# confirm the column layout, since the selection is positional.
# This adds a column to df_0d in place.
df_0d['contribution_sum'] = df_gh.iloc[:, 4:].sum(axis=1).astype("int")

# Aggregate the per-journalist totals by country. The string 'sum' is used
# because passing np.sum as aggfunc emits a FutureWarning on recent pandas.
df_contribution_sum = pd.pivot_table(
    df_0d, index=['country_code'], values=['contribution_sum'], aggfunc='sum'
)

# Attach the totals to the full country table; missing countries become 0.
df_code_country_data_c = df_c2c.join(df_contribution_sum, on='country_code').fillna(0)

locations = list(df_code_country_data_c['country_code'])
text = list(df_code_country_data_c['country'])
z = list(df_code_country_data_c['contribution_sum'])
title = 'Global Data Journalist Github Contribution<br>Source:\
                <a href="http://jplusplus.github.io/global-directory/">\
                Global Data Journalist Directory</a>'

paint_world_map(title,locations,z,text)

In [8]:
# Ten countries with the highest contribution total.
df_code_country_data_c.sort_values('contribution_sum', ascending=False).head(10)

Unnamed: 0,country_code,country,contribution_sum
70,FRA,France,30442.0
211,USA,United States,23098.0
185,ZAF,South Africa,19932.0
187,ESP,Spain,10843.0
75,DEU,Germany,8918.0
193,CHE,Switzerland,6822.0
160,PRT,Portugal,4275.0
99,ITA,Italy,3655.0
101,JPN,Japan,3171.0
192,SWE,Sweden,3018.0


In [9]:
# Contributions per journalist for each country since 2008.
# Both inputs are indexed by country code, so the join aligns them by country.
df_ratio = df_code_amount[['journalist_sum']].join(
    df_contribution_sum['contribution_sum'])
# Label-based, vectorized division: avoids a row-wise apply and the
# deprecated positional x[0] / x[1] lookup on a label-indexed row.
df_ratio['ratio'] = df_ratio['contribution_sum'] / df_ratio['journalist_sum']

# Attach the ratios to the full country table; missing countries become 0.
df_code_country_data_r = df_c2c.join(df_ratio, on='country_code').fillna(0)

locations = list(df_code_country_data_r['country_code'])
text = list(df_code_country_data_r['country'])
z = list(df_code_country_data_r['ratio'])
title = 'Global Data Journalist Github Contribution Ratio<br>Source:\
                <a href="http://jplusplus.github.io/global-directory/">\
                Global Data Journalist Directory</a>'

paint_world_map(title,locations,z,text)

In [70]:
# Ten countries with the highest contributions-per-journalist ratio.
df_code_country_data_r.sort_values('ratio', ascending=False).head(10)

Unnamed: 0,country_code,country,journalist_sum,contribution_sum,ratio
185,ZAF,South Africa,2.0,19932.0,9966.0
48,CRI,Costa Rica,1.0,2642.0,2642.0
70,FRA,France,22.0,30442.0,1383.727273
160,PRT,Portugal,4.0,4275.0,1068.75
101,JPN,Japan,3.0,3171.0,1057.0
193,CHE,Switzerland,8.0,6822.0,852.75
187,ESP,Spain,14.0,10843.0,774.5
192,SWE,Sweden,4.0,3018.0,754.5
211,USA,United States,46.0,23098.0,502.130435
12,AUT,Austria,3.0,1421.0,473.666667


In [11]:
# Distribution of the U.S. journalism job market by state.
import plotly.plotly as py
import pandas as pd


def _state_code(location):
    """Return the upper-cased two-character code after the first ', ' in a location.

    Returns None when the value is not a string (e.g. NaN for a missing
    Location) or contains no ', ' separator; such rows are skipped, which is
    what the previous blanket ``except: pass`` did for them. A code of 'D.'
    is normalised to 'DC'.
    """
    if not isinstance(location, str):
        return None
    parts = location.split(', ')
    if len(parts) < 2:
        return None
    state = parts[1][:2].upper()
    return 'DC' if state == 'D.' else state


df_selected_jobs = pd.read_csv("reference/0 jobs.csv")

# One state code per parseable posting, in file order.
list_state_code = [
    state
    for state in map(_state_code, df_selected_jobs["Location"])
    if state is not None
]

# Single-pass tally; keys stay in first-occurrence order.
dict_count = {}
for state in list_state_code:
    dict_count[state] = dict_count.get(state, 0) + 1
code = list(dict_count.keys())
value = list(dict_count.values())

# Six-stop colour ramp for the choropleth.
scl = [[0.0, 'rgb(242,240,247)'], [0.2, 'rgb(218,218,235)'], [0.4, 'rgb(188,189,220)'],
       [0.6, 'rgb(158,154,200)'], [0.8, 'rgb(117,107,177)'], [1.0, 'rgb(84,39,143)']]
data = [dict(
    type='choropleth',
    colorscale=scl,
    autocolorscale=False,
    locations=code,
    z=value,
    locationmode='USA-states',
    text=code,
    marker=dict(
        line=dict(
            color='rgb(255,255,255)',
            width=2
        )),
    colorbar=dict(
        title="how many positions")
)]

layout = dict(
    title='U.S journalism jobs',
    geo=dict(
        scope='usa',
        projection=dict(type='albers usa'),
        showlakes=True,
        lakecolor='rgb(255, 255, 255)'),
)

fig = dict(data=data, layout=layout)
py.iplot(fig, filename='d3-cloropleth-map')

In [12]:
# Display the per-state posting counts tallied in the previous cell.
dict_count

{'IL': 33,
 'CO': 15,
 'DC': 75,
 'VA': 24,
 'CA': 139,
 'NE': 3,
 'MA': 33,
 'WA': 23,
 'KY': 2,
 'FL': 21,
 'NY': 159,
 'ND': 1,
 'RI': 2,
 'MO': 6,
 'PA': 18,
 'WI': 4,
 'SC': 7,
 'TX': 46,
 'MI': 6,
 'OK': 5,
 'MD': 11,
 'GA': 16,
 'NV': 4,
 'MN': 6,
 'TN': 5,
 'IN': 5,
 'AK': 1,
 'OH': 10,
 'CT': 2,
 'LA': 8,
 'NC': 14,
 'AL': 3,
 'UT': 8,
 'AZ': 6,
 'ME': 2,
 'IA': 5,
 'NJ': 6,
 'ID': 4,
 'MT': 1,
 'AR': 2,
 'KS': 4,
 'NM': 2,
 'OR': 3}