# Theory

## Cleaning data

Data can contain inconsistencies such as:
- outliers
- missing data
- erroneous data
- irrelevant data
- inconsistent data
- formatting


# Code

In [25]:
import re

# Pattern for a single access-log line. Each piece captures one named field;
# the pieces are joined into one expression before compiling.
# NOTE(review): the field layout looks like the Apache "combined" log format — confirm.
format_pat = re.compile("".join([
    r"(?P<host>[\d\.]+)\s",         # host: a run of digits and dots
    r"(?P<identity>\S*)\s",         # identity: any non-whitespace token
    r"(?P<user>\S*)\s",             # user: any non-whitespace token
    r"\[(?P<time>.*?)\]\s",         # time: text between square brackets
    r'"(?P<request>.*?)"\s',        # request: text between double quotes
    r"(?P<status>\d+)\s",           # status: digits only
    r"(?P<bytes>\S*)\s",            # bytes: any non-whitespace token
    r'"(?P<referer>.*?)"\s',        # referer: text between double quotes
    r'"(?P<user_agent>.*?)"\s*',    # user_agent: quoted text, optional trailing whitespace
]))


In [26]:
# Path of the access-log file that the cells below open for reading
logPath = "access_log.txt"

In [42]:
# First, naive attempt at counting how many times each URL is requested.
# URLCounts maps URL -> number of requests seen so far.
URLCounts = {}

# Open the log for reading
with open(logPath, "r") as f:
    # Walk the file line by line with trailing whitespace removed
    for line in map(str.rstrip, f):
        # Lines that do not fit the log pattern are skipped
        match = format_pat.match(line)
        if not match:
            continue
        access = match.groupdict()
        request = access['request']
        try:
            # Naive assumption: every request is "<action> <URL> <protocol>"
            (action, URL, protocol) = request.split()
        except ValueError as err:
            # The data is not standardized: some request fields do not split
            # into exactly three parts. Show the error instead of letting it
            # propagate, so Restart Kernel -> Run All is not halted by this
            # demonstration. Counting still stops at the first bad request.
            print(f"ValueError: {err}")
            break
        # Bump the counter for this URL
        URLCounts[URL] = URLCounts.get(URL, 0) + 1
    else:
        # Reached only when no malformed request interrupted the loop.
        # Order the URLs by hit count, most requested first
        results = sorted(URLCounts, key=URLCounts.get, reverse=True)

        # Show the 20 most requested URLs
        for result in results[:20]:
            print(f"{result}: {URLCounts[result]}")


ValueError: not enough values to unpack (expected 3, got 1)

In [43]:
# Count requests per URL, ignoring request fields that do not split into
# exactly three parts (not an ideal filter yet).
URLCounts = {}

# Open the log for reading
with open(logPath, "r") as f:
    # Walk the file line by line with trailing whitespace removed
    for line in map(str.rstrip, f):
        # Skip lines that do not fit the log pattern
        match = format_pat.match(line)
        if not match:
            continue
        access = match.groupdict()
        request = access['request']
        fields = request.split()
        # Anything other than three whitespace-separated parts is dropped
        if len(fields) != 3:
            continue
        URL = fields[1]  # the middle part is the URL
        # Bump the counter for this URL
        URLCounts[URL] = URLCounts.get(URL, 0) + 1

# Order the URLs by hit count, most requested first
results = sorted(URLCounts, key=URLCounts.get, reverse=True)

# Show the 20 most requested URLs
for result in results[:20]:
    print(f"{result}: {URLCounts[result]}")


/xmlrpc.php: 68494
/wp-login.php: 1923
/: 440
/blog/: 138
/robots.txt: 123
/sitemap_index.xml: 118
/post-sitemap.xml: 118
/page-sitemap.xml: 117
/category-sitemap.xml: 117
/orlando-headlines/: 95
/san-jose-headlines/: 85
http://51.254.206.142/httptest.php: 81
/comics-2/: 76
/travel/: 74
/entertainment/: 72
/business/: 70
/national/: 70
/national-headlines/: 70
/world/: 70
/weather/: 70


In [45]:
# Count requests per URL, keeping only well-formed GET requests.
URLCounts = {}

# Open the log for reading
with open(logPath, "r") as f:
    # Walk the file line by line with trailing whitespace removed
    for line in map(str.rstrip, f):
        # Skip lines that do not fit the log pattern
        match = format_pat.match(line)
        if not match:
            continue
        access = match.groupdict()
        request = access['request']
        fields = request.split()
        # Anything other than three whitespace-separated parts is dropped
        if len(fields) != 3:
            continue
        action, URL, protocol = fields
        # Only the GET method is counted
        if action != 'GET':
            continue
        # Bump the counter for this URL
        URLCounts[URL] = URLCounts.get(URL, 0) + 1

# Order the URLs by hit count, most requested first
results = sorted(URLCounts, key=URLCounts.get, reverse=True)

# Show the 20 most requested URLs
for result in results[:20]:
    print(f"{result}: {URLCounts[result]}")

/: 434
/blog/: 138
/robots.txt: 123
/sitemap_index.xml: 118
/post-sitemap.xml: 118
/page-sitemap.xml: 117
/category-sitemap.xml: 117
/orlando-headlines/: 95
/san-jose-headlines/: 85
http://51.254.206.142/httptest.php: 81
/comics-2/: 76
/travel/: 74
/entertainment/: 72
/business/: 70
/national/: 70
/national-headlines/: 70
/world/: 70
/weather/: 70
/about/: 69
/defense-sticking-head-sand/: 69


In [46]:
# Count how many log lines each user-agent string accounts for.
# UserAgents maps user-agent string -> number of matching log lines.
UserAgents = {}

# Open the log for reading
with open(logPath, "r") as f:
    # Walk the file line by line with trailing whitespace removed
    for line in map(str.rstrip, f):
        # Skip lines that do not fit the log pattern
        match = format_pat.match(line)
        if not match:
            continue
        # Pull the user agent out of the parsed line
        access = match.groupdict()
        agent = access['user_agent']
        # Bump the counter for this user agent
        UserAgents[agent] = UserAgents.get(agent, 0) + 1

# Order the user agents by number of requests, most frequent first
results = sorted(UserAgents, key=UserAgents.get, reverse=True)

# Show every user agent with its count
for result in results:
    print(f"{result}: {UserAgents[result]}")


Mozilla/4.0 (compatible: MSIE 7.0; Windows NT 6.0): 68484
-: 4035
Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0): 1724
W3 Total Cache/0.9.4.1: 468
Mozilla/5.0 (compatible; Baiduspider/2.0; +http://www.baidu.com/search/spider.html): 278
Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html): 248
Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.86 Safari/537.36: 158
Mozilla/5.0 (Windows NT 6.1; WOW64; rv:40.0) Gecko/20100101 Firefox/40.0: 144
Mozilla/5.0 (iPad; CPU OS 8_4 like Mac OS X) AppleWebKit/600.1.4 (KHTML, like Gecko) Version/8.0 Mobile/12H143 Safari/600.1.4: 120
Mozilla/5.0 (Linux; Android 5.1.1; SM-G900T Build/LMY47X) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/46.0.2490.76 Mobile Safari/537.36: 47
Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm): 43
Mozilla/5.0 (compatible; MJ12bot/v1.4.5; http://www.majestic12.co.uk/bot.php?+): 41
Opera/9.80 (Windows NT 6.0) Presto/2.12.388 Version/12.1

In [52]:
# Count GET requests per URL, leaving out traffic whose user agent looks
# automated.
URLCounts = {}

# Open the file
with open(logPath, "r") as f:
    # Read it line by line with trailing whitespace removed
    for line in map(str.rstrip, f):
        # Apply the regex; lines that do not fit are skipped
        match = format_pat.match(line)
        if not match:
            continue
        # Keep the parsed fields as a dictionary
        access = match.groupdict()
        agent = access['user_agent']
        # Drop bots: a bare '-' agent, or one containing any of these markers
        if agent == '-' or any(
            marker in agent
            for marker in ('bot', 'spider', 'Bot', 'Spider', 'W3 Total Cache')
        ):
            continue
        request = access['request']
        fields = request.split()
        # Anything other than three whitespace-separated parts is dropped
        if len(fields) != 3:
            continue
        action, URL, protocol = fields
        # Only GET requests are counted
        if action != 'GET':
            continue
        # Tally the URL
        URLCounts[URL] = URLCounts.get(URL, 0) + 1

# Most requested URLs first
results = sorted(URLCounts, key=URLCounts.get, reverse=True)

for result in results[:20]:
    print(f"{result}: {URLCounts[result]}")

/: 77
/orlando-headlines/: 36
/?page_id=34248: 28
/wp-content/cache/minify/000000/M9AvyUjVzUstLy7PLErVz8lMKkosqtTPKtYvTi7KLCgpBgA.js: 27
/wp-content/cache/minify/000000/M9bPKixNLarUy00szs8D0Zl5AA.js: 27
/wp-content/cache/minify/000000/lY7dDoIwDIVfiG0KxkfxfnbdKO4HuxICTy-it8Zw15PzfSftzPCckJem-x4qUWArqBPl5mygZLEgyhdOaoxToGyGaiALiOfUnIz0qDLOdSZGE-nOlpc3kopDzrSyavVVt_veb5qSDVhjsQ6dHh_B_eE_z2pYIGJ7iBWKeEio_eT9UQe4xHhDll27mGRryVu_pRc.js: 27
/wp-content/cache/minify/000000/fY45DoAwDAQ_FMvkRQgFA5ZyWLajiN9zNHR0O83MRkyt-pIctqYFJPedKyYzfHg2PzOFiENAzaD07AxcpKmTolORvDjZt8KEfhBUGjZYCf8Fb0fvA1TXCw.css: 25
/?author=1: 21
/wp-content/cache/minify/000000/hcrRCYAwDAXAhXyEjiQ1YKAh4SVSx3cE7_uG7ASr4M9qg3kGWyk1adklK84LHtRj_My6Y0Pfqcz-AA.js: 20
/wp-content/uploads/2014/11/nhn1.png: 19
/wp-includes/js/wp-emoji-release.min.js?ver=4.3.1: 17
/wp-content/cache/minify/000000/BcGBCQAgCATAiUSaKYSERPk3avzuht4SkBJnt4tHJdqgnPBqKldesTcN1R8.js: 17
/wp-login.php: 16
/comics-2/: 12
/world/: 12
/favicon.ico: 10
/wp-content/up

In [53]:
# Count GET requests per URL for non-bot traffic, keeping only URLs that end
# with "/".
URLCounts = {}

# Open the file
with open(logPath, "r") as f:
    # Read it line by line with trailing whitespace removed
    for line in map(str.rstrip, f):
        # Apply the regex; lines that do not fit are skipped
        match = format_pat.match(line)
        if not match:
            continue
        # Keep the parsed fields as a dictionary
        access = match.groupdict()
        agent = access['user_agent']
        # Drop bots: a bare '-' agent, or one containing any of these markers
        if agent == '-' or any(
            marker in agent
            for marker in ('bot', 'spider', 'Bot', 'Spider', 'W3 Total Cache')
        ):
            continue
        request = access['request']
        fields = request.split()
        # Anything other than three whitespace-separated parts is dropped
        if len(fields) != 3:
            continue
        action, URL, protocol = fields
        # The URL must end with "/"
        if not URL.endswith("/"):
            continue
        # Only GET requests are counted
        if action != 'GET':
            continue
        # Tally the URL
        URLCounts[URL] = URLCounts.get(URL, 0) + 1

# Most requested URLs first
results = sorted(URLCounts, key=URLCounts.get, reverse=True)

for result in results[:20]:
    print(f"{result}: {URLCounts[result]}")

/: 77
/orlando-headlines/: 36
/comics-2/: 12
/world/: 12
/weather/: 4
/australia/: 4
/about/: 4
/national-headlines/: 3
/feed/: 2
/sample-page/feed/: 2
/science/: 2
/technology/: 2
/entertainment/: 1
/san-jose-headlines/: 1
/business/: 1
/travel/feed/: 1


In [56]:
# Count GET requests per URL for non-bot traffic, keeping only URLs that end
# with "/" and do not contain the substring 'feed'.
URLCounts = {}

# Open the file
with open(logPath, "r") as f:
    # Read it line by line with trailing whitespace removed
    for line in map(str.rstrip, f):
        # Apply the regex; lines that do not fit are skipped
        match = format_pat.match(line)
        if not match:
            continue
        # Keep the parsed fields as a dictionary
        access = match.groupdict()
        agent = access['user_agent']
        # Drop bots: a bare '-' agent, or one containing any of these markers
        if agent == '-' or any(
            marker in agent
            for marker in ('bot', 'spider', 'Bot', 'Spider', 'W3 Total Cache')
        ):
            continue
        request = access['request']
        fields = request.split()
        # Anything other than three whitespace-separated parts is dropped
        if len(fields) != 3:
            continue
        action, URL, protocol = fields
        # Filter out any URL containing the string 'feed'
        if 'feed' in URL:
            continue
        # The URL must end with "/"
        if not URL.endswith("/"):
            continue
        # Only GET requests are counted
        if action != 'GET':
            continue
        # Tally the URL
        URLCounts[URL] = URLCounts.get(URL, 0) + 1

# Most requested URLs first
results = sorted(URLCounts, key=URLCounts.get, reverse=True)

for result in results[:20]:
    print(f"{result}: {URLCounts[result]}")

/: 77
/orlando-headlines/: 36
/comics-2/: 12
/world/: 12
/weather/: 4
/australia/: 4
/about/: 4
/national-headlines/: 3
/science/: 2
/technology/: 2
/entertainment/: 1
/san-jose-headlines/: 1
/business/: 1


In [61]:
# Exploration: print every requested URL that contains the substring 'feed'.
# NOTE(review): the author's understanding is that a 'feed' URL is not really a
# visible page but something used to continue/extend a page — confirm.
# No user-agent (bot) filter and no GET check are applied in this cell, so
# every well-formed request whose URL contains 'feed' is listed.

# Reset the shared counter; this cell does not populate it
URLCounts = {}

# Open the file
with open(logPath, "r") as f:
    # Read it line by line with trailing whitespace removed
    for line in map(str.rstrip, f):
        # Apply the regex; lines that do not fit are skipped
        match = format_pat.match(line)
        if not match:
            continue
        # Keep the parsed fields as a dictionary
        access = match.groupdict()
        request = access['request']
        fields = request.split()
        # Anything other than three whitespace-separated parts is dropped
        if len(fields) != 3:
            continue
        (action, URL, protocol) = fields
        # Show only the URLs containing the string 'feed'
        if 'feed' in URL:
            print(URL)

/?feed=rss2
/feed/
/?feed=rss2
/feed/
/washington-dc-sports/feed/
/about/feed/
/sample-page/feed/
/weather/feed/
/san-francisco-sports/feed/
/feeds/tampa-bay-times-top-news/
/feed/
/?feed=rss2
/feed/
/washington-dc-sports/feed/
/feed/
/sample-page/feed/
/feed/
/san-jose-headlines/feed/
/travel/feed/
