In [2]:
import os
import re
import socket

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [9]:
# Load the raw domain list; the file is parsed with ";" as the field separator.
data = pd.read_csv("domains.csv", sep=";")
#data.head(10)

In [10]:
# Take the third column (positional index 2) and lower-case every value.
# The regular expressions used later in this notebook are written in lower case.
domains = data.iloc[:, 2].str.lower()
domains.head()

0                   1000меню.рф
1                    24успех.рф
2                абв-клиника.рф
3              авто-перекупы.рф
4    автосервис-старый-оскол.рф
Name: домен2, dtype: object

In [17]:
# Quick look: print every domain that contains "авито" or "sber".
brand_pattern = re.compile(r".*авито.*|.*sber.*")
for domain in domains:
  if brand_pattern.match(domain) is not None:
    print(domain)

авито-топ.рф
podpiskasber.ru
meetup-sberhealth-frontend.ru
sber-ins.ru
aisberg-store.ru
sberkidscity.ru
mensberth.ru
solomonalsberg.ru
sber-biz.ru
sbersoftdev.ru
sberioglstics.ru
sminfacryhosbergco.ru


In [18]:
def filter_domains(domains: list, regular_expression: str) -> list[str]:
  """Return the domains that match ``regular_expression``.

  Matching uses ``re.match`` semantics: the pattern is anchored at the start
  of each domain, so a caller that wants a substring match has to wrap the
  pattern in ``.*`` itself.

  Args:
    domains: Iterable of domain names (a list or a pandas Series both work).
      Entries that are not strings, such as the NaN pandas produces for an
      empty CSV field, are skipped instead of raising ``TypeError``.
    regular_expression: Pattern source handed to ``re.compile``.

  Returns:
    The matching domains as a list, in their original order.
  """
  # Compile once instead of re-resolving the pattern for every domain.
  pattern = re.compile(regular_expression)
  return [
      domain for domain in domains
      if isinstance(domain, str) and pattern.match(domain)
  ]

# NOTE(review): vtb_regex is not referenced by any other visible cell.
vtb_regex = r".*vtb.*"
# Latin and Cyrillic spellings of "avito" and "sber", each allowed anywhere in the name.
full_regex = r".*avito.*|.*авито.*|.*sber.*|.*сбер.*"
filtered_domains = filter_domains(domains, full_regex)
filtered_domains

['меетуп-сберхеалтх-фронтенд.рф',
 'авито-топ.рф',
 'avito-message.ru',
 'avito-messages.ru',
 'avito-sells.ru',
 'avitosell.ru',
 'avitosells.ru',
 'podpiskasber.ru',
 'prof-avitolog.ru',
 'sells-avito.ru',
 'kupit-otzivi-avito.ru',
 'meetup-sberhealth-frontend.ru',
 'nakrutka-avito.ru',
 'sber-ins.ru',
 'aisberg-store.ru',
 'avito-automarketolog.ru',
 'ryazantsevaproavito.ru',
 'sberkidscity.ru',
 'avito-id274653.ru',
 'avito-otzivi.ru',
 'avito-scn.ru',
 'avitolog31.ru',
 'mensberth.ru',
 'avito-id82991.ru',
 'avito-note.ru',
 'avito-notes.ru',
 'avito-notification.ru',
 'avito-notifications.ru',
 'solomonalsberg.ru',
 'avito-lifehack.ru',
 'proavitologkurs-bot.ru',
 'profavito.ru',
 'sber-biz.ru',
 'sbersoftdev.ru',
 'tvoyavitolog.ru',
 'sberioglstics.ru',
 'sminfacryhosbergco.ru']

In [19]:
def send_response(domain: str, timeout: float = 1) -> "requests.Response | str":
  """Fetch the front page of ``domain``, trying HTTPS first and then HTTP.

  Args:
    domain: Bare host name without a scheme, e.g. ``"example.ru"``.
    timeout: Timeout in seconds passed to ``requests.get`` for each attempt.
      Defaults to 1, the value that used to be hard-coded.

  Returns:
    The ``requests.Response`` of the first attempt that succeeds, or the
    string ``'Error'`` when both attempts fail. The other helpers in this
    notebook detect that sentinel with ``isinstance`` checks.
  """
  for scheme in ("https://", "http://"):
    try:
      return requests.get(scheme + domain, timeout=timeout)
    except Exception:
      # Deliberately best-effort: an unreachable or misconfigured host must
      # not abort the crawl, so fall through to the next scheme.
      continue

  return 'Error'

def get_status_code(response: requests.Response) -> int:
  """Return the HTTP status code of ``response``.

  Anything that is not a ``requests.Response`` (for example the ``'Error'``
  sentinel returned by ``send_response``) yields the string ``"Error"``.
  """
  if not isinstance(response, requests.Response):
    return "Error"
  return response.status_code
  
def get_title(response: requests.Response) -> str:
  """Extract the text of the page's ``<title>`` element.

  Returns ``"Error"`` when ``response`` is not a ``requests.Response`` and
  ``"Title not found"`` when the parsed document has no ``<title>`` tag.
  """
  if not isinstance(response, requests.Response):
    return "Error"

  parsed_page = BeautifulSoup(response.text, 'html.parser')
  title_element = parsed_page.find('title')
  if not title_element:
    return "Title not found"
  return title_element.text
  
def get_redirect(response: requests.Response) -> list[str]:
  """List the redirect hops recorded in ``response.history``.

  Each hop is returned as a two-item list ``[status_code, url]``. The result
  is empty when there were no redirects or when ``response`` is not a
  ``requests.Response``.
  """
  if not isinstance(response, requests.Response):
    return []
  return [[hop.status_code, hop.url] for hop in response.history or ()]

def get_ip(domain: str) -> str:
  """Resolve ``domain`` with ``socket.gethostbyname``.

  Returns the resolved address string, or ``"Error"`` if the lookup raises
  any exception.
  """
  try:
    resolved = socket.gethostbyname(domain)
  except Exception:
    return "Error"
  return resolved
  
def get_whois_info(domain: str, api_key: "str | None" = None, timeout: float = 10) -> tuple[str, str]:
  """Look up the creation date and registrar of ``domain`` via the RST Cloud WHOIS API.

  The API key is no longer embedded in the notebook. It is taken from the
  ``api_key`` argument or, when that is omitted, from the
  ``RST_WHOIS_API_KEY`` environment variable.

  Args:
    domain: Domain name appended to the WHOIS endpoint URL.
    api_key: Optional key sent in the ``x-api-key`` header.
    timeout: Timeout in seconds for the HTTP request, so that an unresponsive
      API cannot block the crawl indefinitely.

  Returns:
    A ``(created_on, registrar)`` tuple read from the JSON response, or
    ``("Unknown", "Unknown")`` when the request fails, the status code is not
    200, or the body is not a JSON object.

  Raises:
    RuntimeError: If no API key was supplied and the environment variable is
      not set. This is a configuration error rather than a per-domain one.
  """
  if api_key is None:
    api_key = os.environ.get("RST_WHOIS_API_KEY")
  if not api_key:
    raise RuntimeError(
        "RST Cloud API key is not configured: set the RST_WHOIS_API_KEY "
        "environment variable or pass api_key=..."
    )

  headers = {
      'accept': 'application/json',
      'x-api-key': api_key,
  }

  try:
    response = requests.get(
        f'https://api.rstcloud.net/v1/whois/{domain}',
        headers=headers,
        timeout=timeout,
    )
  except requests.RequestException:
    # Network problems are treated like any other failed lookup.
    return "Unknown", "Unknown"

  if response.status_code != 200:
    return "Unknown", "Unknown"

  try:
    whois_data = response.json()
  except ValueError:
    # The body was not valid JSON.
    return "Unknown", "Unknown"

  if not isinstance(whois_data, dict):
    return "Unknown", "Unknown"
  return whois_data.get('created_on'), whois_data.get("registrar")
  
def get_info_about_domain(domain: str):
  """Collect everything this notebook records about a single domain.

  Returns a 7-tuple:
  ``(domain, status_code, title, redirects, ip, created_on, registrar)``.
  """
  page = send_response(domain)
  http_fields = (get_status_code(page), get_title(page), get_redirect(page))
  resolved_ip = get_ip(domain)
  registered_at, registrar_name = get_whois_info(domain)
  return (domain, *http_fields, resolved_ip, registered_at, registrar_name)

In [None]:
# Try the collector on a single domain before running it over the whole list.
get_info_about_domain("google.com")

In [20]:
# Collect one record per filtered domain. Each call performs several network
# lookups (HTTP fetch, DNS resolution, WHOIS API).
full_data = list(map(get_info_about_domain, filtered_domains))
full_data

[('меетуп-сберхеалтх-фронтенд.рф',
  200,
  'Хостинг-провайдер TimeWeb.ru | Мы рады приветствовать Вас среди наших клиентов!',
  [],
  '92.53.96.244',
  'Unknown',
  'Unknown'),
 ('авито-топ.рф', 403, '403', [], '185.215.4.41', 'Unknown', 'Unknown'),
 ('avito-message.ru',
  204,
  'Title not found',
  [],
  '104.21.96.52',
  '2024-03-16 18:53:20',
  'unknown'),
 ('avito-messages.ru',
  204,
  'Title not found',
  [],
  '104.21.20.241',
  '2024-03-16 18:53:20',
  'unknown'),
 ('avito-sells.ru',
  204,
  'Title not found',
  [],
  '104.21.67.213',
  '2024-03-16 18:53:20',
  'unknown'),
 ('avitosell.ru',
  204,
  'Title not found',
  [],
  '172.67.139.153',
  '2024-03-16 18:53:20',
  'unknown'),
 ('avitosells.ru',
  204,
  'Title not found',
  [],
  '172.67.163.186',
  '2024-03-16 18:53:20',
  'unknown'),
 ('podpiskasber.ru',
  'Error',
  'Error',
  [],
  'Error',
  '2024-03-16 05:32:07',
  'unknown'),
 ('prof-avitolog.ru',
  'Error',
  'Error',
  [],
  'Error',
  '2024-03-16 15:31:21',
 

In [21]:
# Column order mirrors the tuple returned by get_info_about_domain.
column_names = [
    "domain",
    "status_code",
    "title",
    "redirects",
    "ip",
    "created_on",
    "registrar",
]
df = pd.DataFrame(data=full_data, columns=column_names)
df.head(15)

Unnamed: 0,domain,status_code,title,redirects,ip,created_on,registrar
0,меетуп-сберхеалтх-фронтенд.рф,200,Хостинг-провайдер TimeWeb.ru | Мы рады приветс...,[],92.53.96.244,Unknown,Unknown
1,авито-топ.рф,403,403,[],185.215.4.41,Unknown,Unknown
2,avito-message.ru,204,Title not found,[],104.21.96.52,2024-03-16 18:53:20,unknown
3,avito-messages.ru,204,Title not found,[],104.21.20.241,2024-03-16 18:53:20,unknown
4,avito-sells.ru,204,Title not found,[],104.21.67.213,2024-03-16 18:53:20,unknown
5,avitosell.ru,204,Title not found,[],172.67.139.153,2024-03-16 18:53:20,unknown
6,avitosells.ru,204,Title not found,[],172.67.163.186,2024-03-16 18:53:20,unknown
7,podpiskasber.ru,Error,Error,[],Error,2024-03-16 05:32:07,unknown
8,prof-avitolog.ru,Error,Error,[],Error,2024-03-16 15:31:21,unknown
9,sells-avito.ru,204,Title not found,[],104.21.47.233,2024-03-16 18:53:20,unknown


In [22]:
# Persist the collected table; index=False keeps the row index out of the file.
df.to_csv("domains_result.csv", index=False)

In [None]:
# NOTE(review): this reads "domains.csv" with the default separator and rebinds
# `data`, while the cells below index the "domain" and "status_code" columns,
# which this notebook writes to "domains_result.csv". Confirm which file is intended.
data = pd.read_csv("domains.csv")
data.head(15)

In [None]:
# Show the row(s) for one specific domain.
data[data["domain"] == "vtb-personal-app.ru"]

In [None]:
# Show the rows whose status_code column holds the string "Error".
data[data["status_code"] == "Error"]

In [None]:
# Count the distinct values in each column.
data.nunique()

In [None]:
# Summary statistics for the loaded frame.
data.describe()