# 載入所需套件

In [1]:
import io
import json
import time

import pandas as pd
import requests
from tqdm import tqdm

# 定義抓取可能免費IP清單函數

In [2]:
def possible_ip_crawler(url_list=None, timeout=10):
    """Scrape free-proxy listing pages and return the candidate proxies.

    For every page, the first HTML table is parsed and each row that has both
    an ``IP Address`` and a ``Port`` value becomes one ``"ip:port"`` string.

    Parameters
    ----------
    url_list : iterable of str, optional
        Pages to scrape. Defaults to us-proxy.org and free-proxy-list.net.
    timeout : float, optional
        Seconds to wait for each page before ``requests`` gives up, so a
        stalled site cannot hang the cell forever.

    Returns
    -------
    list of str
        Unique ``"ip:port"`` strings, sorted so that scraping the same data
        twice yields the same order.

    Raises
    ------
    requests.RequestException
        If a page cannot be fetched or answers with an HTTP error status.

    Errors raised by ``pandas.read_html`` (for example when a page contains
    no table) and a ``KeyError`` for a missing ``IP Address``/``Port`` column
    propagate unchanged.
    """
    if url_list is None:
        url_list = ['https://www.us-proxy.org/', 'https://free-proxy-list.net/']

    unique_proxies = set()
    for url in tqdm(url_list):
        response = requests.get(url, timeout=timeout)
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()

        # pandas >= 2.1 deprecates passing literal HTML text to read_html;
        # a file-like buffer works on old and new versions alike.
        table = pd.read_html(io.StringIO(response.text))[0]

        # Keep only rows that actually carry an address and a port. This
        # replaces blindly dropping the last row, which lost a real proxy
        # whenever the table had no trailing filler row and crashed on
        # int(NaN) when an incomplete row appeared anywhere else.
        rows = table.dropna(subset=['IP Address', 'Port'])

        # Ports may be parsed as floats (8080.0); go through int to get "8080".
        ports = rows['Port'].astype(int).astype(str)
        addresses = rows['IP Address'].astype(str) + ':' + ports
        unique_proxies.update(addresses.tolist())

    return sorted(unique_proxies)

# 定義檢驗可能免費IP清單函數

In [3]:
def test_possible_ip(possible_ip_list):
    test_url='https://api.ipify.org?format=json'
    response=requests.get(test_url)
    raw_ip=response.json()['ip']
    
    ip_dict_list=[]
    for e in tqdm(possible_ip_list):
        proxies={'http':e,'https':e}
        time.sleep(0.5)
        try:
            response=requests.get(test_url,proxies=proxies,timeout=5)
            if response.json()['ip']!=raw_ip:
                ip_dict_list.append(proxies)
            else:
                pass
        except:
            pass
    
    return ip_dict_list

# 抓取可能免費IP清單

In [4]:
# Collect the de-duplicated candidate "ip:port" strings from the listing sites.
possible_ip_list = possible_ip_crawler()

100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:08<00:00,  3.78s/it]


# 檢驗可能免費IP清單

In [5]:
# Probe every candidate and keep only the proxies that respond and change
# the visible IP (the recorded run below took about 36 minutes for 475 candidates).
ip_dict_list = test_possible_ip(possible_ip_list=possible_ip_list)

100%|████████████████████████████████████████████████████████████████████████████████| 475/475 [35:57<00:00,  3.26s/it]


# 可用免費IP清單

In [6]:
# Display the proxies that passed the check; each entry maps both 'http' and
# 'https' to the same "ip:port" string.
ip_dict_list

[{'http': '37.120.214.22:8080', 'https': '37.120.214.22:8080'},
 {'http': '216.229.60.65:8080', 'https': '216.229.60.65:8080'},
 {'http': '182.253.70.252:80', 'https': '182.253.70.252:80'},
 {'http': '45.133.182.18:18080', 'https': '45.133.182.18:18080'},
 {'http': '45.230.171.91:999', 'https': '45.230.171.91:999'},
 {'http': '41.217.219.53:31398', 'https': '41.217.219.53:31398'},
 {'http': '103.126.149.33:8080', 'https': '103.126.149.33:8080'},
 {'http': '62.171.177.113:8888', 'https': '62.171.177.113:8888'},
 {'http': '202.138.242.41:46701', 'https': '202.138.242.41:46701'},
 {'http': '95.174.67.50:18080', 'https': '95.174.67.50:18080'},
 {'http': '180.248.72.200:3128', 'https': '180.248.72.200:3128'},
 {'http': '198.50.163.192:3129', 'https': '198.50.163.192:3129'},
 {'http': '122.102.27.197:23500', 'https': '122.102.27.197:23500'},
 {'http': '103.109.59.242:53281', 'https': '103.109.59.242:53281'}]

In [7]:
# Report how many scraped candidates passed the proxy check.
usable_count = len(ip_dict_list)
candidate_count = len(possible_ip_list)
# With no candidates at all the ratio is reported as 0 instead of raising
# ZeroDivisionError.
usable_percent = (usable_count / candidate_count) * 100 if candidate_count else 0.0
print('可用免費ip比例 : {}/{}={:4.2f}%'.format(usable_count, candidate_count, usable_percent))

可用免費ip比例 : 14/475=2.95%


# 儲存可用免費IP清單成Json檔

In [8]:
# Serialize the working proxies to a JSON file in the current directory.
with open('./可用免費ip清單.json', mode='w') as json_file:
    json_file.write(json.dumps(ip_dict_list))