<a href="https://colab.research.google.com/github/terose73/ICLab/blob/main/ICLab_Analysis_ter9hb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Analyzing ICLab Censorship Data**

(1) measuring the type of censorship 

(2) measuring the change of censorship over time

(3) visualizing the censorship (e.g., location, scale) 


idea 1. map AS numbers to AS names
-> see which ASes censor the most sites

idea 2. measure censorship by website type (business, abortion, finance, etc.)

idea 3. map country codes to full country name

idea 4. see which website types each country censors

**Installation / Google Drive Linking**

In [1]:
import sklearn
import sys

!{sys.executable} -m pip install pycountry
!{sys.executable} -m pip install -U libra
!{sys.executable} -m pip install urllib

import collections
import os
import urllib
import urllib.request

import numpy as np
import pandas as pd
import pycountry
from libra import client

np.random.seed(2001) 

%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

Collecting pycountry
[?25l  Downloading https://files.pythonhosted.org/packages/76/73/6f1a412f14f68c273feea29a6ea9b9f1e268177d32e0e69ad6790d306312/pycountry-20.7.3.tar.gz (10.1MB)
[K     |████████████████████████████████| 10.1MB 7.6MB/s 
[?25hBuilding wheels for collected packages: pycountry
  Building wheel for pycountry (setup.py) ... [?25l[?25hdone
  Created wheel for pycountry: filename=pycountry-20.7.3-py2.py3-none-any.whl size=10746865 sha256=42f2efd7fad1afa5932acaf9733c6e1ef19d9788861df7f09cd15e77ee9c5add
  Stored in directory: /root/.cache/pip/wheels/33/4e/a6/be297e6b83567e537bed9df4a93f8590ec01c1acfbcd405348
Successfully built pycountry
Installing collected packages: pycountry
Successfully installed pycountry-20.7.3
Collecting libra
[?25l  Downloading https://files.pythonhosted.org/packages/29/73/60b2dbde0cc0505ed1fb31fca6bc940afd819b18ca49c1fd7d999b7fd7ed/libra-1.2.5-py3-none-any.whl (83kB)
[K     |████████████████████████████████| 92kB 5.4MB/s 
Collecting autocorrect


In [2]:
# Prerequisite: add the shared ICLab folder to your Drive first
# ("Add shortcut to 'My Drive'"), then run this cell:
# https://drive.google.com/drive/folders/1jjSNjIIB0EWDachhaheHePXjvLQeNHkr

from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Sanity check: list the shared ICLab folder to confirm the monthly csv files are visible
!ls "/content/drive/My Drive/ICLab Data for S&P20 paper"

 iclab_2017-01.csv   iclab_2017-09.csv	 iclab_2018-05.csv
 iclab_2017-02.csv   iclab_2017-10.csv	 iclab_2018-06.csv
 iclab_2017-03.csv   iclab_2017-11.csv	 iclab_2018-07.csv
 iclab_2017-04.csv   iclab_2017-12.csv	 iclab_2018-08.csv
 iclab_2017-05.csv   iclab_2018-01.csv	 iclab_2018-09.csv
 iclab_2017-06.csv   iclab_2018-02.csv	'URL classification'
 iclab_2017-07.csv   iclab_2018-03.csv
 iclab_2017-08.csv   iclab_2018-04.csv


In [4]:
# Load one month of ICLab measurements through the My Drive shortcut.
# NOTE: `dir` shadows the Python builtin of the same name; it is kept because
# a later cell builds the URL-classification path from it.

dir = '/content/drive/My Drive/ICLab Data for S&P20 paper'

monthly_csv = '/'.join([dir, 'iclab_2018-09.csv'])
df = pd.read_csv(monthly_csv)

In [5]:
# Quick structural summary of the data: column names, non-null counts, dtypes, and number of entries

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1052887 entries, 0 to 1052886
Data columns (total 17 columns):
 #   Column            Non-Null Count    Dtype  
---  ------            --------------    -----  
 0   filename          1052887 non-null  object 
 1   server_t          1052887 non-null  object 
 2   country           1052887 non-null  object 
 3   as_number         1034724 non-null  float64
 4   schedule_name     1052887 non-null  object 
 5   url               1052887 non-null  object 
 6   dns               735638 non-null   object 
 7   dns_reason        267952 non-null   object 
 8   dns_all           998828 non-null   object 
 9   dns_reason_all    249893 non-null   object 
 10  http_status       898415 non-null   float64
 11  block             898415 non-null   object 
 12  body_len          898415 non-null   float64
 13  http_reason       898415 non-null   object 
 14  packet_updated    45244 non-null    object 
 15  packet_reason     45244 non-null    object 
 16  

In [6]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns
df.describe()

Unnamed: 0,as_number,http_status,body_len
count,1034724.0,898415.0,898415.0
mean,72899.21,179.898106,84693.76
std,75235.22,111.618692,166334.4
min,1249.0,-1.0,0.0
25%,12989.0,200.0,1064.0
50%,42994.0,200.0,23415.0
75%,63008.0,200.0,91069.0
max,395111.0,999.0,3628628.0


These summary statistics are of limited use because *as_number* and *http_status* are discrete codes rather than quantities, but *body_len* may be useful for detecting censorship or may reveal some interesting underlying patterns.

**Data Cleanup and Feature Engineering**

In [7]:
# List the distinct values found in each column (one row per column).
# max_colwidth is set to 0 so the long value lists are not cut to the default column width.
pd.options.display.max_colwidth = 0
df.agg(['unique']).T

Unnamed: 0,unique
filename,"[baseline-2018-09-01T000154.371069.json.bz2, baseline-2018-10-02T011527.678225.json.bz2, baseline-2018-09-01T002206.909253.json.bz2, baseline-2018-09-01T000211.982577.json.bz2, baseline-2018-09-01T012805.179303.json.bz2, baseline-2018-09-01T000348.154035.json.bz2, baseline-2018-09-29T122200.922828.json.bz2, baseline-2018-09-01T001914.221847.json.bz2, baseline-2018-09-01T002347.327314.json.bz2, baseline-2018-09-01T022556.188689.json.bz2, baseline-2018-09-01T012507.257001.json.bz2, baseline-2018-09-16T074206.498584.json.bz2, baseline-2018-09-29T015739.798070.json.bz2, baseline-2018-09-01T012755.940830.json.bz2, baseline-2018-09-01T014549.549549.json.bz2, baseline-2018-09-01T031201.944294.json.bz2, baseline-2018-09-01T031910.129586.json.bz2, baseline-2018-09-01T041723.359026.json.bz2, baseline-2018-09-18T112730.485175.json.bz2, baseline-2018-09-01T032809.041273.json.bz2, baseline-2018-09-01T043033.228301.json.bz2, baseline-2018-09-01T033020.601453.json.bz2, baseline-2018-09-29T183142.687279.json.bz2, baseline-2018-09-01T042033.228432.json.bz2, baseline-2018-09-01T055009.914886.json.bz2, baseline-2018-09-01T045208.307428.json.bz2, baseline-2018-09-01T064204.770879.json.bz2, baseline-2018-09-01T062346.951665.json.bz2, baseline-2018-09-25T085727.507107.json.bz2, baseline-2018-09-01T070700.369570.json.bz2, baseline-2018-09-01T080554.927843.json.bz2, baseline-2018-09-01T080609.035147.json.bz2, baseline-2018-09-01T084135.726037.json.bz2, baseline-2018-09-01T094301.914785.json.bz2, baseline-2018-09-01T100717.985693.json.bz2, baseline-2018-09-01T094813.042643.json.bz2, baseline-2018-09-01T084221.770008.json.bz2, baseline-2018-09-01T093527.657129.json.bz2, baseline-2018-09-01T104617.083913.json.bz2, baseline-2018-09-01T115336.125062.json.bz2, baseline-2018-09-01T110413.611776.json.bz2, baseline-2018-09-01T112734.851861.json.bz2, baseline-2018-09-01T122429.615098.json.bz2, baseline-2018-09-01T130937.370936.json.bz2, baseline-2018-09-01T132549.694038.json.bz2, 
baseline-2018-09-01T142921.828216.json.bz2, baseline-2018-09-01T133024.580958.json.bz2, baseline-2018-09-01T135130.512703.json.bz2, baseline-2018-09-01T153242.171205.json.bz2, baseline-2018-09-01T155959.708269.json.bz2, baseline-2018-09-01T145819.006208.json.bz2, baseline-2018-09-01T161700.324482.json.bz2, baseline-2018-09-01T152133.399919.json.bz2, baseline-2018-09-01T162506.050253.json.bz2, baseline-2018-09-01T164325.603238.json.bz2, baseline-2018-09-01T155840.772808.json.bz2, baseline-2018-09-01T170158.251843.json.bz2, baseline-2018-09-01T155931.769552.json.bz2, baseline-2018-09-24T214632.291619.json.bz2, baseline-2018-09-01T170123.077746.json.bz2, baseline-2018-09-16T084328.850857.json.bz2, baseline-2018-09-01T175721.886001.json.bz2, baseline-2018-09-01T172643.354094.json.bz2, baseline-2018-09-01T183403.564449.json.bz2, baseline-2018-09-20T003743.820612.json.bz2, baseline-2018-09-29T221602.443699.json.bz2, baseline-2018-09-29T210024.601853.json.bz2, baseline-2018-09-01T185540.188463.json.bz2, baseline-2018-09-01T173925.731773.json.bz2, baseline-2018-09-01T200019.659990.json.bz2, baseline-2018-09-01T224830.438845.json.bz2, baseline-2018-09-01T215218.852063.json.bz2, baseline-2018-09-01T223306.519698.json.bz2, baseline-2018-09-01T233135.165997.json.bz2, baseline-2018-09-01T230625.089331.json.bz2, baseline-2018-09-02T001809.925976.json.bz2, baseline-2018-09-02T013758.958945.json.bz2, baseline-2018-09-02T002126.252146.json.bz2, baseline-2018-09-02T002131.177627.json.bz2, baseline-2018-09-02T013818.909647.json.bz2, baseline-2018-09-02T002610.042334.json.bz2, baseline-2018-09-17T144653.568304.json.bz2, baseline-2018-09-02T010613.888775.json.bz2, baseline-2018-09-02T020907.268134.json.bz2, baseline-2018-09-02T024643.458483.json.bz2, baseline-2018-09-21T170739.291503.json.bz2, baseline-2018-09-02T023737.334076.json.bz2, baseline-2018-09-02T032356.148063.json.bz2, baseline-2018-09-02T022742.763006.json.bz2, baseline-2018-09-02T030122.647772.json.bz2, 
baseline-2018-09-02T041404.389483.json.bz2, baseline-2018-09-02T032809.990945.json.bz2, baseline-2018-09-09T061743.632577.json.bz2, baseline-2018-09-02T042833.963892.json.bz2, baseline-2018-09-02T080340.680588.json.bz2, baseline-2018-09-02T064008.565145.json.bz2, baseline-2018-09-02T070533.565298.json.bz2, baseline-2018-09-02T075844.852194.json.bz2, baseline-2018-09-02T074751.299903.json.bz2, baseline-2018-09-02T080457.312980.json.bz2, ...]"
server_t,"[2018-09-01T00:01:54.354Z, 2018-09-01T00:02:11.575Z, 2018-09-01T00:03:48.075Z, 2018-09-01T00:19:14.000Z, 2018-09-01T00:23:47.072Z, 2018-09-01T01:25:07.161Z, 2018-09-01T01:27:55.773Z, 2018-09-01T01:45:49.431Z, 2018-09-01T02:18:29.014Z, 2018-09-01T03:19:10.096Z, 2018-09-01T03:28:08.772Z, 2018-09-01T03:30:20.522Z, 2018-09-01T04:20:32.939Z, 2018-09-01T04:52:08.236Z, 2018-09-01T06:23:46.706Z, 2018-09-01T07:07:00.270Z, 2018-09-01T08:06:08.755Z, 2018-09-01T08:41:35.582Z, 2018-09-01T08:42:21.548Z, 2018-09-01T09:35:27.573Z, 2018-09-01T10:46:16.796Z, 2018-09-01T11:04:13.418Z, 2018-09-01T11:27:34.765Z, 2018-09-01T13:09:37.099Z, 2018-09-01T13:25:49.589Z, 2018-09-01T13:30:24.503Z, 2018-09-01T13:51:30.306Z, 2018-09-01T14:35:04.415Z, 2018-09-01T14:58:18.753Z, 2018-09-01T15:21:33.097Z, 2018-09-01T15:58:40.568Z, 2018-09-01T17:01:23.023Z, 2018-09-01T17:26:43.237Z, 2018-09-01T17:39:25.665Z, 2018-09-01T20:00:19.464Z, 2018-09-01T21:52:18.712Z, 2018-09-01T22:33:06.458Z, 2018-09-01T23:06:24.801Z, 2018-09-02T00:18:09.641Z, 2018-09-02T00:21:26.167Z, 2018-09-02T00:21:31.161Z, 2018-09-02T00:26:09.751Z, 2018-09-02T01:06:13.409Z, 2018-09-02T01:29:04.646Z, 2018-09-02T01:39:08.387Z, 2018-09-02T02:27:42.683Z, 2018-09-02T03:01:22.544Z, 2018-09-02T03:19:03.845Z, 2018-09-02T03:28:09.926Z, 2018-09-02T06:40:08.311Z, 2018-09-02T07:05:33.431Z, 2018-09-02T07:47:50.994Z, 2018-09-02T08:04:57.225Z, 2018-09-02T08:22:59.165Z, 2018-09-02T08:55:42.378Z, 2018-09-02T10:11:20.061Z, 2018-09-02T10:24:36.499Z, 2018-09-02T11:25:09.648Z, 2018-09-02T15:56:55.552Z, 2018-09-02T15:59:20.492Z, 2018-09-02T16:21:36.532Z, 2018-09-02T17:08:44.623Z, 2018-09-02T18:00:52.223Z, 2018-09-02T18:03:52.057Z, 2018-09-02T19:19:25.742Z, 2018-09-02T19:28:08.019Z, 2018-09-02T20:42:58.095Z, 2018-09-02T21:41:03.342Z, 2018-09-02T21:50:38.662Z, 2018-09-02T23:31:14.142Z, 2018-09-03T00:01:16.941Z, 2018-09-03T00:01:53.313Z, 2018-09-03T00:01:57.575Z, 2018-09-03T00:09:13.896Z, 2018-09-03T00:31:18.716Z, 2018-09-03T00:36:23.385Z, 
2018-09-03T01:44:16.802Z, 2018-09-03T02:18:42.255Z, 2018-09-03T02:20:22.087Z, 2018-09-03T02:46:05.815Z, 2018-09-03T02:49:16.872Z, 2018-09-03T02:54:42.369Z, 2018-09-03T02:57:49.946Z, 2018-09-03T03:16:34.547Z, 2018-09-03T03:43:44.474Z, 2018-09-03T04:25:31.994Z, 2018-09-03T06:01:21.899Z, 2018-09-03T07:23:34.344Z, 2018-09-03T07:35:18.058Z, 2018-09-03T08:50:26.472Z, 2018-09-03T11:54:52.867Z, 2018-09-03T12:03:09.827Z, 2018-09-03T12:14:29.018Z, 2018-09-03T12:28:34.971Z, 2018-09-03T14:05:58.537Z, 2018-09-03T14:46:10.510Z, 2018-09-03T15:30:24.566Z, 2018-09-03T16:05:50.862Z, 2018-09-03T16:18:36.488Z, 2018-09-03T19:03:06.150Z, ...]"
country,"[US, KR, ES, ZA, CZ, PL, MY, RU, CN, TW, BG, HK, RO, PE, HU, NO, MX, UA, NL, VN, JP, LT, RS, AU, KE, SK, IN, CL, CA, LI, SG, ID, NZ, LU, BZ, CO, TR, BR, SE, IS, FI, DZ, PT, DK, IL, MD, AT, SC]"
as_number,"[1249.0, 4766.0, 12989.0, 43578.0, 37692.0, 198605.0, 60068.0, nan, 22400.0, 54455.0, 53889.0, 43317.0, 38001.0, 30083.0, 3462.0, 59564.0, 36351.0, 9009.0, 61440.0, 29278.0, 34989.0, 45899.0, 16125.0, 63008.0, 207134.0, 395111.0, 198371.0, 45671.0, 7656.0, 201924.0, 33182.0, 16276.0, 133480.0, 199524.0, 136557.0, 51430.0, 45839.0, 7850.0, 49505.0, 53597.0, 36114.0, 37153.0, 50613.0, 51765.0, 327813.0, 55720.0, 204287.0, 11990.0, 29889.0, 29854.0, 19916.0, 46261.0, 42994.0, 43289.0, 62874.0, 3223.0, 44764.0]"
schedule_name,"[country-sensitive-at, country-sensitive-kr, citizenlab-global, alexa-global, country-sensitive-es, country-sensitive-za, country-sensitive-pl, country-sensitive-my, country-sensitive-ru, country-sensitive-cn, country-sensitive-tw, country-sensitive-hk, country-sensitive-pe, country-sensitive-hu, country-sensitive-mx, country-sensitive-ua, country-sensitive-vn, country-sensitive-jp, country-sensitive-in, country-sensitive-cl, country-sensitive-ca, country-sensitive-sg, country-sensitive-id, country-sensitive-ke, country-sensitive-co, country-sensitive-de, country-sensitive-tr, country-sensitive-br, country-sensitive-dz, country-sensitive-az, country-sensitive-md, country-sensitive-ng]"
url,"[http://kinox.to/, http://movie4k.to/, 4shared.com, news.bbc.co.uk, ngt.jinbo.net, kieu.or.kr, www.korea-publ.com, www.lineage.co.kr, www.buddy79.com, www.bukhanmall.net, www.asiantribune.com, www.yanori.co.kr, www.chaju.org, freedom.jinbo.net, vok.rep.kp, www.islamawareness.net, www.freeinternet.or.kr, play.google.com, gesomoon.com, www.dateinasia.com, english.yna.co.kr, www.jjangfile.net, naenara.com, www.aids.or.kr, cafe.daum.net, media.daum.net, www.equaline.or.kr, www.daum.net, eng.peoplepower21.org, omadam1.net, www.pullbbang.com, www.orangefile.com, www.onekorea.org, www.ccej.or.kr, www.knic.com.kp, nocensor.org, www.ddanzi.com, www.greenkorea.org, www.nicesms.co.kr, won.a.gg, www.nonghyup.com, www.ilbe.com, www.asianboygay.com, www.korean-books.com.kp, www.biomics.co.kr, www.geocities.com, www.todayhumor.co.kr, www.aindf.com, www.auction.co.kr, www.upschool.net, www.ahrchk.net, sfkuk.freeservers.com, www.88square.com, clparty.kr, pssp.org, www.nzeen.com, www.i-um.com, tvpot.media.daum.net, leftworkers.org, www.wegames.net, www.ifj-asia.org, naenara.com.kp, www.hani.co.kr, hypercortex.net, www.big.or.jp, www.chol.com, www.hellkorea.com, erozn.net, www.wevo.co.kr, data.donkeyhote.co.kr, www.cinewel.com, www.rights.or.kr, www.cherryasia.com, www.munjanara.co.kr, ppss.kr, www.dreamwiz.com, korelcfund.org.kp, www.ithaisex.com, dprk.1accesshost.com, tcafek.com, myhome.naver.com, betasia.com, 416family.org, www.dprkstudies.org, www.lxnetworks.co.kr, www.inochong.org, usacrime.or.kr, www.hotline.or.kr, cyworld.nate.com, thaicherry.com, minbyun.jinbo.net, anjali.uncovered.com, www.101sex888.com, hotasianbabes.com, www.gayasianxxx.com, tsthai.com, banyalba.com, koreanfriendfinder.com, mingkyaa.com, www.bangkokbangers.com, ...]"
dns,"[nan, false, -2, true]"
dns_reason,"[nan, sameip, no_control_resp, sameas, not_violate_threshold, no_control_as_group, normal_fail, noerror_no_answer, normal_fail,loopback, violate_threshold,no_response, violate_threshold, loopback, ip_in_other_regions,no_response, violate_threshold,normal_fail, reserved, no_response,loopback]"
dns_all,"[nan, false, -2, true]"
dns_reason_all,"[nan, no_control_resp, sameip, no_control_as_group, sameas, normal_fail, reserved, noerror_no_answer, not_violate_threshold, sameip,normal_fail,loopback, no_control_as_group,violate_threshold,no_response, no_control_as_group,violate_threshold, reserved,no_response, normal_fail,loopback, no_control_resp,loopback, sameip,loopback, ip_in_other_regions,no_response, no_control_as_group,violate_threshold,normal_fail, sameas,violate_threshold, violate_threshold,no_response, loopback, no_response,loopback]"


In [8]:
# Build a lookup table from AS number (kept as a string) to AS name.
#
# The URL used to carry a `?token=...` query parameter. Access tokens must not
# be committed to a notebook, and the recorded run of this cell failed with an
# HTTPError while using it, so the plain raw-file URL is used instead.
# NOTE(review): if the repository is private, this download needs credentials
# supplied outside the notebook source — confirm how the file should be fetched.
AS_MAPPINGS_URL = "https://raw.githubusercontent.com/terose73/ICLab/main/as_mappings.txt"

as_mappings = {}

# `with` closes the HTTP response even if parsing fails part-way through.
with urllib.request.urlopen(AS_MAPPINGS_URL) as data:
    for line in data:  # the response is iterable and yields one bytes line at a time
        # Keep only the text before the first comma, then split "<number> <name>".
        line = line.decode('utf-8').split(',')[0].strip()
        if not line:
            continue  # skip blank lines instead of failing on the unpacking below
        as_num, _, name = line.partition(' ')
        as_mappings[as_num] = name.strip()

def find_mapping(x):
  """Look up the AS name for an AS number given as a string.

  Only the text before the first '.' is used as the key, so "1249" and
  "1249.0" resolve to the same `as_mappings` entry. Returns NaN when the
  key is absent or the stored name is empty.
  """
  as_key = x.split('.')[0]
  return as_mappings.get(as_key) or np.nan

HTTPError: ignored

In [None]:
# convert timestamps to pandas format in dataframe
# (vectorised parse instead of a per-row pd.Timestamp call; the trailing "Z" in the data marks UTC)
df['server_t'] = pd.to_datetime(df['server_t'], utc=True)

# get longer country names using pycountry API
def _country_name(code):
  """Return the full name for an ISO alpha-2 country code, or `code` itself when pycountry has no match."""
  try:
    match = pycountry.countries.get(alpha_2=code)
  except LookupError:
    match = None
  return code if match is None else match.name

# Resolve each distinct value once rather than once per row. Values without a
# match (for example names already expanded by an earlier run of this cell)
# pass through unchanged instead of raising AttributeError on `None.name`.
country_names = {code: _country_name(code) for code in df['country'].unique()}
df['country'] = df['country'].map(country_names)

# get full as_name from number
df['as_name'] = df['as_number'].apply(lambda x: find_mapping(str(x)))

In [None]:
# drop unnecessary filename field
# drop(..., errors='ignore') rather than `del`, so re-running this cell after
# the column is already gone does not raise KeyError
df = df.drop(columns=['filename'], errors='ignore')

In [None]:
# replace {} and [] in http_reason with NaN
# (compare on the string form of each value and blank out both placeholders in one pass)
http_reason_text = df['http_reason'].apply(str)
df['http_reason'] = df['http_reason'].mask(http_reason_text.isin(['{}', '[]']))

In [None]:
# http status code of -1 doesn't make sense -> cast it to NaN
# Assign the masked column back instead of using inplace=True on the selected
# column: an in-place mask on `df['http_status']` is not guaranteed to write
# through to `df`. This also matches how http_reason is masked.

df['http_status'] = df['http_status'].mask(df['http_status'].eq(-1))

In [None]:
# dns and dns_all of -2 doesn't make sense (and presents typing problems) -> cast it to NaN

def is_minus_2(x):
  """Return True when `x` is not a recognisable boolean value and should be masked.

  Accepts the same spellings as `convert` ('true'/'True'/'false'/'False') and
  also real booleans, so the masking cell can be re-run after the column has
  been cast to booleans. Every other value (e.g. '-2', NaN, pd.NA) is reported
  as invalid; non-strings are rejected without comparing them, because
  comparing pd.NA to a string cannot be used as a truth value.
  """
  if isinstance(x, (bool, np.bool_)):
    return False
  if not isinstance(x, str):
    return True
  return x not in ('true', 'false', 'True', 'False')

def convert(x):
  """Map a textual boolean to a real boolean, or pd.NA when it is not one.

  'true'/'True' -> True and 'false'/'False' -> False. Values that are already
  booleans are returned as plain bools, so re-running the cast cell does not
  turn previously converted data into pd.NA. Anything else (including NaN and
  pd.NA) yields pd.NA; non-strings are not compared, because comparing pd.NA
  to a string cannot be used as a truth value.
  """
  if isinstance(x, (bool, np.bool_)):
    return bool(x)
  if not isinstance(x, str):
    return pd.NA
  if x == "true" or x == "True":
    return True
  elif x == "false" or x == "False":
    return False
  return pd.NA

# Blank out every dns / dns_all entry that is_minus_2 flags
for dns_col in ('dns', 'dns_all'):
  df[dns_col] = df[dns_col].mask(df[dns_col].apply(is_minus_2))

In [None]:
# cast booleans to nullable type

# the dns columns go through convert() first, then to the nullable 'boolean' dtype
for text_col in ('dns_all', 'dns'):
  df[text_col] = df[text_col].apply(convert).astype('boolean')

# the remaining flag columns are cast directly
for flag_col in ('packet_updated', 'censored_updated', 'block'):
  df[flag_col] = df[flag_col].astype('boolean')

In [None]:
# cast floats to nullable integers ("Int64" keeps missing values as <NA>)

for float_col in ('as_number', 'http_status', 'body_len'):
  df[float_col] = df[float_col].astype("Int64")

In [None]:
# strip url http://, https://, and www

def strip_extra(x):
  """Return `x` without a leading http:// or https://, surrounding slashes, and a leading 'www.'.

  Prefixes are removed as whole substrings. str.strip(chars) treats its
  argument as a set of characters to trim from both ends, which mangles
  hostnames such as 'play.google.com' (-> 'lay.google.com') or
  'www.wegames.net' (-> 'wegames.ne').
  """
  for scheme in ('http://', 'https://'):
    if x.startswith(scheme):
      x = x[len(scheme):]
      break
  x = x.strip('/')
  if x.startswith('www.'):
    x = x[len('www.'):]
  return x

# Normalise every URL in place with strip_extra
df['url'] = df['url'].map(strip_extra)

In [None]:
# Attach a website category to every row using the URL-classification file
# stored next to the monthly CSVs. Each usable line of that file has the form
# "<classification>|||<url>".

c = {}

# `with` closes the file handle even if parsing fails part-way through.
with open(dir + '/URL classification/type_domain_mapping.txt', 'r') as url_classification:
  for line in url_classification:
    classification, sep, url = line.strip().partition('|||')
    if not sep:
      continue  # blank or malformed line: nothing to map

    # Normalise the key with the same helper that was applied to df['url'],
    # so dictionary keys and lookups are cleaned identically.
    c[strip_extra(url)] = classification

# Report the size of the mapping instead of dumping the whole dict.
print(len(c), 'classified domains loaded')
df['classification'] = df['url'].apply(c.get)


In [None]:
# Re-inspect the distinct values of each column (one row per column) after the cleanup cells
pd.set_option('display.max_colwidth', 0)
df.agg(['unique']).T

In [None]:
# Write the frame to a CSV file (without the index column) in the working directory
df.to_csv('ICLab_test1.csv', index=False)

In [None]:
# Initialize Libra client on the 'ICLab_test1.csv' file
new_client = client('ICLab_test1.csv')

In [None]:
# Run a Libra neural-network query for 10 epochs.
# NOTE(review): the instruction string 'model' names no target column — presumably a
# placeholder; confirm the intended prediction target against Libra's documentation.
new_client.neural_network_query('model', epochs=10)