In [1]:
import requests
import random
import json 
import re
import os 
import base64
import time 
import glob 
import pandas as pd 
from bs4 import BeautifulSoup
import asyncio
from proxybroker import Broker
import multiprocessing
from collections import Counter
import datetime

In [9]:
def get_proxy_source1(config):
    """Scrape the proxy table published at https://www.sslproxies.org/.

    Parameters
    ----------
    config : dict
        Request options, forwarded unchanged to ``requests_get``.

    Returns
    -------
    pandas.DataFrame or None
        One row per proxy with columns ``ip``, ``port``, ``country``,
        ``level``, ``https`` (bool) and ``created`` (today's date,
        ``YYYY-MM-DD``). ``None`` when the page could not be fetched, the
        table is not found, or no usable row was parsed.
    """
    url = 'https://www.sslproxies.org/'
    res = requests_get(url, config)
    if not res:
        return None

    soup = BeautifulSoup(res['html'], 'html')
    table = soup.find('table', class_="table table-striped table-bordered")
    if table is None:
        return None

    cols = ['ip', 'port', 'code', 'country', 'level', 'google', 'https', 'last_checked']
    wanted = ['ip', 'port', 'country', 'level', 'https']

    # Fall back to the table itself when the markup has no explicit <tbody>.
    tbody = table.find('tbody')
    container = table if tbody is None else tbody

    records = []
    for tr in container.find_all('tr'):
        cells = [td.text.strip() for td in tr.find_all('td')]
        obj = dict(zip(cols, cells))
        # Rows with too few <td> cells (headers, spacers) cannot provide the
        # fields we keep; skip them instead of failing with a KeyError.
        if not all(key in obj for key in wanted):
            continue
        # Keep only the first word of the anonymity level; an empty cell is
        # left as-is rather than raising an IndexError.
        level_words = obj['level'].split()
        if level_words:
            obj['level'] = level_words[0]
        obj['https'] = obj['https'] == 'yes'
        records.append(obj)

    if not records:
        return None

    df = pd.DataFrame(records)[wanted]
    df['created'] = datetime.datetime.now().isoformat()[:10]
    return df


def get_proxy_source2(config):
    """Scrape the HTTP/HTTPS proxy listings on free-proxy.cz.

    For each of the four base listings (http/https x level1/level2) the first
    page and pages 2-5 are requested through ``requests_get``; rows from every
    page that could be fetched are pooled together.

    Parameters
    ----------
    config : dict
        Request options, forwarded unchanged to ``requests_get``.

    Returns
    -------
    pandas.DataFrame or None
        One row per proxy with columns ``ip``, ``port``, ``country``,
        ``level`` (``'elite'`` for "High anonymity", otherwise
        ``'anonymous'``), ``https`` (bool) and ``created`` (today's date,
        ``YYYY-MM-DD``). ``None`` when no usable row was parsed.
    """
    urls1 = [
        'http://free-proxy.cz/en/proxylist/country/all/http/ping/level1',
        'http://free-proxy.cz/en/proxylist/country/all/http/ping/level2',
        'http://free-proxy.cz/en/proxylist/country/all/https/ping/level1',
        'http://free-proxy.cz/en/proxylist/country/all/https/ping/level2',
    ]
    # Follow-up pages are addressed by appending "/<page number>".
    urls2 = [url+'/'+i for url in urls1 for i in '2345']

    cols = ['ip', 'port', 'https', 'country', 'region', 'city', 'level', 'speed', 'uptime', 'response', 'last_checked']
    wanted = ['ip', 'port', 'country', 'level', 'https']

    records = []
    for url in urls1+urls2:
        res = requests_get(url, config)
        if not res:
            continue
        soup = BeautifulSoup(res['html'], 'html')
        table = soup.find('table', {'id': 'proxy_list'})
        if table is None:
            continue
        # Fall back to the table itself when the markup has no explicit <tbody>.
        tbody = table.find('tbody')
        container = table if tbody is None else tbody
        for tr in container.find_all('tr'):
            cells = [td.text.strip() for td in tr.find_all('td')]
            obj = dict(zip(cols, cells))
            # Rows with too few <td> cells (e.g. a single spanning cell)
            # cannot provide the fields we keep; skip them instead of
            # failing with a KeyError.
            if not all(key in obj for key in wanted):
                continue
            obj['level'] = 'elite' if obj['level'] == 'High anonymity' else 'anonymous'
            obj['https'] = obj['https'] == 'HTTPS'
            records.append(obj)

    if not records:
        return None

    df = pd.DataFrame(records)[wanted]
    df['created'] = datetime.datetime.now().isoformat()[:10]
    return df


def get_proxy_source3(config):
    """Download the plain-text proxy list from the proxyscrape API.

    The response is split on whitespace and each token is expected to look
    like ``<scheme>://<ip>:<port>``. Only ``http`` and ``https`` entries are
    kept; tokens with any other scheme, or without both an ip and a port,
    are skipped.

    Parameters
    ----------
    config : dict
        Request options, forwarded unchanged to ``requests_get``.

    Returns
    -------
    pandas.DataFrame or None
        One row per proxy with columns ``ip``, ``port``, ``https`` (bool) and
        ``created`` (today's date, ``YYYY-MM-DD``). ``None`` when the request
        failed or no usable entry was parsed, matching the other
        ``get_proxy_source*`` functions.
    """
    url = 'https://api.proxyscrape.com/v3/free-proxy-list/get?request=displayproxies&proxy_format=protocolipport&format=text'
    res = requests_get(url, config)
    if not res:
        return None

    https_by_scheme = {'http': False, 'https': True}
    records = []
    # str.split() with no argument never yields empty tokens.
    for line in res['html'].split():
        scheme = line.split(':')[0]
        if scheme not in https_by_scheme:
            continue
        endpoint = line.split('//')[-1].split(':')
        # Guard against tokens lacking ":port" (previously an IndexError).
        if len(endpoint) < 2 or not endpoint[0] or not endpoint[1]:
            continue
        records.append({'ip': endpoint[0],
                        'port': endpoint[1],
                        'https': https_by_scheme[scheme]})

    # An empty frame would carry no ip/port columns and break downstream
    # de-duplication, so report "nothing found" as None instead.
    if not records:
        return None

    df = pd.DataFrame(records)
    df['created'] = datetime.datetime.now().isoformat()[:10]
    return df


def _sample_headers(config, verbose):
    """Return the request headers for one attempt, or None if none are configured."""
    spec = config.get('headers', None)
    if not spec:
        return None
    if not isinstance(spec, str):
        return spec
    # A string is treated as the path of a JSON file holding a list of header
    # dicts; one of them is drawn at random.
    with open(spec, 'r') as fid:
        headers = random.choice(json.load(fid))
    if verbose:
        print('sampled one header from:', spec)
    return headers


def _sample_proxies(config, verbose):
    """Return the ``proxies`` mapping for one attempt, or None if none apply."""
    spec = config.get('proxies', None)
    if not spec:
        return None
    if not isinstance(spec, str):
        return spec
    # A string is treated as the path of a CSV file with (at least) ``ip`` and
    # ``port`` columns; rows are narrowed by ``proxies_filter`` and one
    # surviving row is drawn at random.
    df = pd.read_csv(spec, dtype=str)
    for fkey, fvalue in config.get('proxies_filter', dict()).items():
        if fkey in df.columns:
            df = df[df[fkey].isin(fvalue)]
    if len(df) == 0:
        print('No proxy meet the requirements:', spec)
        return None
    irow = random.choice(list(df.index.values))
    # NOTE(review): the 'https://' scheme presumably makes requests speak TLS
    # to the proxy itself - confirm the proxies in the pool support that.
    proxies = {
        'http': 'http://'+df.loc[irow, 'ip']+':'+df.loc[irow, 'port'],
        'https': 'https://'+df.loc[irow, 'ip']+':'+df.loc[irow, 'port'],
    }
    if verbose:
        print('sampled one proxy from:', spec, '->', proxies)
    return proxies


def requests_get(url, config=None):
    """GET ``url`` with optional retries and per-attempt header/proxy sampling.

    Parameters
    ----------
    url : str
        Address to fetch.
    config : dict, optional
        Recognised keys (all optional):

        - ``ntry`` (int, default 1): maximum number of attempts.
        - ``verbose`` (bool, default False): print progress messages.
        - ``headers``: a headers dict, or the path of a JSON file containing
          a list of header dicts from which one is sampled per attempt.
        - ``proxies``: a ``requests``-style proxies dict, or the path of a CSV
          file with ``ip``/``port`` columns from which one row is sampled per
          attempt.
        - ``proxies_filter`` (dict): ``{column: allowed_values}`` applied to
          the proxies CSV before sampling.
        - ``timeout`` (default 30), ``allow_redirects`` (default False),
          ``verify`` (default True): passed to ``requests.get``.
        - ``sleep`` (seconds, default 0): pause between failed attempts.

    Returns
    -------
    dict or None
        ``{'html': <response text>, 'latency': <seconds>}`` for the first
        attempt answering with status 200; ``None`` if every attempt failed.

    Notes
    -----
    Errors while reading the headers/proxies files propagate to the caller;
    only errors raised by the request itself are treated as a failed attempt.
    """
    config = {} if config is None else config
    ntry = config.get('ntry', 1)
    verbose = config.get('verbose', False)
    for itry in range(ntry):
        if verbose:
            print(f'get({itry+1}/{ntry}):', url)
        headers = _sample_headers(config, verbose)
        proxies = _sample_proxies(config, verbose)
        try:
            t0 = time.time()
            res = requests.get(url,
                               headers=headers,
                               proxies=proxies,
                               timeout=config.get('timeout', 30),
                               allow_redirects=config.get('allow_redirects', False),
                               verify=config.get('verify', True))
            latency = time.time()-t0
            if res.status_code == 200:
                if verbose:
                    print('Success in getting:', url, 'latency:', latency)
                return {'html': res.text, 'latency': latency}
            if verbose:
                print(f'Abnormal status code ({res.status_code}) in getting({itry+1}/{ntry}):', url)
        except Exception as exc:
            # Best effort: a failed attempt is reported, not raised. Catching
            # Exception (not a bare except) lets KeyboardInterrupt/SystemExit
            # still stop a long-running crawl.
            if verbose:
                print(f'Failed in getting({itry+1}/{ntry}):', url, '->', repr(exc))
        # Pause only when another attempt will follow.
        if itry + 1 < ntry:
            time.sleep(config.get('sleep', 0))
    return None


def update_proxies(sources=[get_proxy_source1, get_proxy_source3],
                   dir_source='/tmp', fsave=None, nprocess=32):
    """Refresh the proxy pool and test every proxy against a probe URL.

    Steps:

    1. Load every ``*.csv`` already present in ``dir_source``.
    2. Call each function in ``sources``; every non-empty result is written to
       ``dir_source`` as ``<function name>_<timestamp>.csv`` and added to the
       pool.
    3. De-duplicate the pool on ``(ip, port)`` and fetch
       ``http://www.aphanti.com/myip`` through each proxy in a process pool.
    4. Optionally write the annotated pool to ``fsave``.

    Parameters
    ----------
    sources : list of callables
        Each is called as ``source(config)`` and returns a DataFrame with at
        least ``ip`` and ``port`` columns, or ``None``.
    dir_source : str
        Directory holding the per-source CSV snapshots (created if missing).
    fsave : str or None
        Destination CSV for the checked pool; nothing is written when falsy.
    nprocess : int
        Number of worker processes used for the checks.

    Returns
    -------
    pandas.DataFrame or None
        The pool with added columns ``latency`` (seconds, ``1e9`` for failed
        checks), ``valid`` (bool) and ``checked`` (timestamp). ``None`` when
        no proxy was available to check.
    """
    url = 'http://www.aphanti.com/myip'
    with open('../data/headers.json', 'r') as fid:
        headers = random.choice(json.load(fid))
    config = {'verbose': True, 'ntry': 1, 'sleep': 0, 'timeout': 10,
              'verify': False, 'allow_redirects': False,
              'headers': headers}
    endpoint_cols = {'ip', 'port'}

    # Previously saved snapshots; frames lacking ip/port columns cannot be
    # checked and would break the de-duplication below.
    dfs = []
    for fn in glob.glob(dir_source+'/*.csv'):
        cached = pd.read_csv(fn, dtype=str)
        if endpoint_cols <= set(cached.columns):
            dfs.append(cached)

    os.makedirs(dir_source, exist_ok=True)
    for source in sources:
        fresh = source(config)
        if fresh is None or len(fresh) == 0 or not endpoint_cols <= set(fresh.columns):
            continue
        fn = dir_source+'/'+source.__name__+'_'+datetime.datetime.now().isoformat()[:19]+'.csv'
        print(source.__name__, 'save', len(fresh), 'proxies to', fn)
        fresh.to_csv(fn, index=None)
        dfs.append(fresh)

    if not dfs:
        print('Total pool:', 0)
        return None

    df = pd.concat(dfs, axis=0, ignore_index=True)
    # Rows without an ip or port cannot be turned into a proxy URL.
    df = df.dropna(subset=['ip', 'port']).drop_duplicates(subset=['ip', 'port'])
    print('Total pool:', len(df))
    if len(df) == 0:
        return None

    arglist = []
    for i in df.index.values:
        endpoint = df.loc[i, 'ip']+':'+df.loc[i, 'port']
        proxies = {'http': 'http://'+endpoint, 'https': 'https://'+endpoint}
        arglist.append((url, {**config, 'proxies': proxies}))

    with multiprocessing.Pool(processes=nprocess) as pool:
        results = pool.starmap(requests_get, arglist)

    # requests_get returns None for a failed check.
    df['latency'] = [x['latency'] if x else 1e9 for x in results]
    df['valid'] = [bool(x) for x in results]
    df['checked'] = datetime.datetime.now().isoformat()[:19]

    if fsave:
        dir_save = os.path.dirname(fsave)
        # dirname is '' for a bare filename; os.makedirs('') would raise.
        if dir_save:
            os.makedirs(dir_save, exist_ok=True)
        df.to_csv(fsave, index=None)
    return df

In [None]:
# Refresh the proxy pool from sources 1 and 3, re-check every pooled proxy with
# 64 worker processes, and write the checked table to checked.csv.
# NOTE(review): the paths below are absolute and user-specific; adjust them
# before running on another machine.
update_proxies(sources=[get_proxy_source1, get_proxy_source3],
               dir_source='/home/alice/data/proxies/sources',
               fsave='/home/alice/data/proxies/checked.csv',
               nprocess=64)


get(1/1): https://www.sslproxies.org/




Success in getting: https://www.sslproxies.org/ latency: 0.2666819095611572
get(1/1): https://api.proxyscrape.com/v3/free-proxy-list/get?request=displayproxies&proxy_format=protocolipport&format=text
Success in getting: https://api.proxyscrape.com/v3/free-proxy-list/get?request=displayproxies&proxy_format=protocolipport&format=text latency: 0.3121070861816406
get_proxy_source3 save 1004 proxies to /home/alice/data/proxies/sources/get_proxy_source3_2024-07-26T15:36:17.csv
Total pool: 2074
get(1/1):get(1/1):get(1/1):get(1/1):get(1/1):get(1/1):get(1/1):get(1/1):get(1/1):get(1/1):get(1/1):get(1/1):get(1/1):get(1/1):get(1/1):get(1/1): get(1/1):get(1/1):get(1/1): get(1/1): get(1/1):get(1/1):get(1/1):   get(1/1):get(1/1): get(1/1): get(1/1):get(1/1):get(1/1):get(1/1):get(1/1):get(1/1): get(1/1):get(1/1):   get(1/1):  http://www.aphanti.com/myip  get(1/1):get(1/1): get(1/1):get(1/1):http://www.aphanti.com/myipget(1/1):  get(1/1):get(1/1):get(1/1):get(1/1):get(1/1):get(1/1): http://www.aphanti.

In [16]:
# Manual smoke test: fetch the probe URL through one hard-coded proxy and show
# the response body. The commented-out URLs are alternative endpoints.
# url = 'http://free-proxy.cz/en/proxylist/country/all/https/ping/level2'
# url = 'https://proxylist.geonode.com/api/proxy-list?limit=500&page=1&sort_by=lastChecked&sort_type=desc'
# url = 'https://api.proxyscrape.com/v3/free-proxy-list/get?request=displayproxies&proxy_format=protocolipport&format=text'
url = 'http://www.aphanti.com/myip'
# Read the header pool with a context manager so the file handle is closed.
with open('../data/headers.json', 'r') as fid:
    headers_pool = json.load(fid)
config = {'verbose': True, 'ntry': 1, 'sleep': 0, 'timeout': 10,
          'verify': False, 'allow_redirects': False,
          'headers': random.choice(headers_pool),
          'proxies': {'http': 'http://160.86.242.23:8080', 'https': 'https://160.86.242.23:8080'}
          # 'proxies': '~/data/proxies/pool.csv',
          # 'proxies_filter': {'valid': ['True']},
         }
res = requests_get(url, config)
# requests_get returns None when the request fails, so guard the lookup
# instead of raising a TypeError on a dead proxy.
res['html'] if res else None

get(1/1): http://www.aphanti.com/myip
Success in getting: http://www.aphanti.com/myip latency: 0.8135342597961426


'{"REMOTE_ADDR": "160.86.242.23", "REMOTE_PORT": "17612", "REQUEST_METHOD": "GET", "REQUEST_SCHEME": "http", "REQUEST_URI": "/myip", "HTTP_USER_AGENT": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:77.0) Gecko/20100101 Firefox/77.0", "datetime": "2024-07-24T13:52:39.159843", "city": "Osaka", "continent_code": "AS", "continent_name": "Asia", "country_code": "JP", "country_name": "Japan", "dma_code": null, "is_in_european_union": false, "latitude": 34.6833, "longitude": 135.5, "postal_code": "550-0001", "region": "27", "time_zone": "Asia/Tokyo"}'

In [37]:
# Load the checked proxy table from disk (the same path the update_proxies
# call passes as `fsave`) and show its (rows, columns) shape.
df = pd.read_csv('/home/alice/data/proxies/checked.csv')
df.shape

<html><head><title>筷数云办公-登录</title></head><body></body></html>



In [23]:
# Display the current `df`.
# NOTE(review): execution counts are out of order (this cell is In [23], the
# read_csv cell is In [37]), so what `df` held when this ran cannot be
# determined from the notebook.
df

In [77]:
# Scratch check: this tests the string literal 'df' (not the variable `df`),
# so the result is always True.
isinstance('df', str)

True