# Recursive extractor worksheet

### Installing packages

In [None]:
import sys
#  `sys.executable` is the interpreter running this kernel, so pip installs into the
#  same environment the notebook uses. The leading `!` is an IPython shell escape.
#  NOTE(review): itables is not imported in the cells visible here - confirm it is still needed.
!{sys.executable} -m pip install itables

### Get current working directory

In [None]:
import os
#  Show the kernel's working directory; the data files in this notebook are opened by bare relative name
print(os.getcwd())

### Load data

In [None]:
#  Parse the extractor results
import json
with open('results-1.json', 'rb') as results_file:
    d = json.load(results_file)
print(f'Total number of files = {len(d)}')

#  Collect every action value once, in first-seen order
actions = []
for entry in d:
    action = entry.get('action')
    if action in actions:
        continue
    actions.append(action)
print(f'Distinct actions = {actions}')

#  Flattened files are the entries whose action is 'copy' or 'continue'
flattened_files = [entry for entry in d if entry.get('action') in ('copy', 'continue')]
print(f'Flattened files = {len(flattened_files)}/{len(d)}')

#  Split the flattened files on their reported encoding in one pass
binary_files = []
non_binary_files = []
for entry in flattened_files:
    if entry.get('mime-encoding') == 'binary':
        binary_files.append(entry)
    else:
        non_binary_files.append(entry)
print(f'Binary files = {len(binary_files)}')
print(f'Non-Binary files = {len(non_binary_files)}')

#  Sanity check: the two partitions together must cover the flattened files
if len(binary_files) + len(non_binary_files) == len(flattened_files):
    print('All files accounted for')

### Querying MIME types

In [None]:
def _first_seen(entries, key):
    """Return the values stored under *key* across *entries*, de-duplicated in first-seen order."""
    seen = []
    for entry in entries:
        value = entry.get(key)
        if value not in seen:
            seen.append(value)
    return seen


#  Distinct MIME types over every entry
mime_types = _first_seen(d, 'mime-type')
print(f'Total distinct MIME types = {len(mime_types)}')

#  Distinct MIME types over the flattened files only
flattened_mime_types = _first_seen(flattened_files, 'mime-type')
print(f'Flattened files total distinct MIME types = {len(flattened_mime_types)}')

#  Types seen somewhere in the full listing but on no flattened file
missing_after_flatten = [m for m in mime_types if m not in flattened_mime_types]
print(f'MIME types not present after flattened files = {missing_after_flatten}')

#  Generator: it is consumed by the set() call in the print below
unique_mime_encodings = (entry.get('mime-encoding') for entry in d)
print(f'Distinct encodings {set(unique_mime_encodings)}')

In [None]:
#  Count the flattened files per MIME type
mime_type_distinct_totals = {}
for entry in flattened_files:
    kind = entry.get('mime-type')
    mime_type_distinct_totals[kind] = mime_type_distinct_totals.get(kind, 0) + 1
#  Cross-check: the per-type counts should add up to the number of flattened files
print(f'{len(flattened_files)} =? {sum(mime_type_distinct_totals.values())}')
mime_type_distinct_totals

In [None]:
#  Count the binary files per MIME type
binary_mime_type_distinct_totals = {}
for entry in binary_files:
    kind = entry.get('mime-type')
    binary_mime_type_distinct_totals[kind] = binary_mime_type_distinct_totals.get(kind, 0) + 1
#  Cross-check: the per-type counts should add up to the number of binary files
print(f'{len(binary_files)} =? {sum(binary_mime_type_distinct_totals.values())}')
binary_mime_type_distinct_totals

In [None]:
#  Every entry (flattened or not) reported as binary with the generic octet-stream type
testq = [
    entry
    for entry in d
    if (entry.get('mime-encoding'), entry.get('mime-type')) == ('binary', 'application/octet-stream')
]
testq

### Munging data

#### Organise data by mime-type

In [None]:
#  Group the flattened files by MIME type, appending to each group's list in place.
#  An earlier attempt rebuilt the list on every iteration via `dict.update` and
#  `+ [i]`; it was noted as slow and is not used.
results_by_mime_type = {}
for entry in flattened_files:
    results_by_mime_type.setdefault(entry.get('mime-type'), []).append(entry)
print(f'Distinct mime-types = {len(results_by_mime_type)}')

mime_types_count_total = 0

for mime_type, entries in results_by_mime_type.items():
    #  Only the first file's encoding is shown per type. `.get` is used, like everywhere
    #  else in this notebook, so an entry without the key prints None instead of raising KeyError.
    print(f'"{mime_type}": {len(entries)} {entries[0].get("mime-encoding")}')
    mime_types_count_total += len(entries)

#  Cross-check: the group sizes should add up to the number of flattened files
print(f'{len(flattened_files)} =? {mime_types_count_total}')

#### Whitelist of mime-types to ignore

In [None]:
#  MIME types to ignore: a binary file whose type is listed here is left out of
#  files_for_inspection (built at the end of this cell). Entries that are commented
#  out are not part of the list, so files of those types stay in the inspection set.
mime_type_whitelist = [
    #'application/gzip',
    'application/octet-stream',
    'application/pdf',
    'application/pgp-keys',
    #'application/vnd.microsoft.portable-executable',
    'application/vnd.ms-htmlhelp',
    'application/x-adobe-aco',
    #'application/x-apple-diskimage',
    'application/x-apple-rsr',
    #'application/x-bytecode.python',
    'application/x-coff',
    'application/x-dvi',
    #'application/x-executable',
    #"application/x-executable, can't read elf program headers at 4294967088",
    'application/x-gettext-translation',
    'application/x-git',
    'application/x-ima',
    #'application/x-iso9660-image',
    'application/x-java-applet',
    #'application/x-lzip',
    #'application/x-ms-pdb',
    #'application/x-object',
    #'application/x-ole-storage',
    #'application/x-pie-executable',
    #'application/x-raw-disk-image',
    #'application/x-sharedlib',
    'application/x-std-dictionary',
    #'application/x-tar',
    #'application/x-xz',
    #'application/zip',
    'audio/basic',
    'audio/x-aiff',
    'audio/x-unknown',
    'audio/x-wav',
    'image/bmp',
    'image/gif',
    'image/jpeg',
    'image/png',
    'image/tiff',
    'image/vnd.adobe.photoshop',
    'image/vnd.microsoft.icon',
    'image/webp',
    'image/x-exr',
    'image/x-icns',
    'image/x-portable-bitmap',
    'image/x-portable-greymap',
    'image/x-portable-pixmap',
    'inode/symlink',
    'inode/x-empty',
    'text/plain',
    'text/x-shellscript'
    ]

#  Keep only the binary files whose MIME type is not whitelisted
files_for_inspection = [i for i in binary_files if i.get('mime-type') not in mime_type_whitelist]

#### Visualise data

In [None]:
from IPython.display import display
import pandas as pd

#  Raise pandas' display limits (row count, column count, total width, column width)
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
    ('display.max_colwidth', 1000),
):
    pd.set_option(option_name, option_value)

df = pd.DataFrame(data=files_for_inspection)
#  Render every column except the two listed here
hidden_columns = ['mime-encoding', 'action']
visible_columns = ~df.columns.isin(hidden_columns)
df.loc[:, visible_columns]

#### Write to `html` [results.html](/files/results.html)

In [None]:
import pandas as pd
#  Rebuild the frame from files_for_inspection (same construction as the visualisation cell)
df = pd.DataFrame(data=files_for_inspection)
#  Static head of the report page, up to the opening <body> and the artifact-count banner.
#    * first <script>: on DOMContentLoaded, makes every <th> of #info-table clickable and
#      re-orders the <tbody> rows on click, flipping asc/desc per column
#    * second <script>: on window load, rewrites each <td>'s innerHTML so that substrings
#      accepted by `new URL()` with an http* protocol become <a target="_blank"> links
#    * <style>: fonts, table borders, body colours, and a CSS counter shown in the
#      first cell of each row
#  This is a plain string, not an f-string, so the JS/CSS braces are literal;
#  `{INSERT_LENGTH}` is a placeholder token substituted via str.replace.
s = """
<!DOCTYPE html>
<html>
<head>
<meta content="text/html;charset=utf-8" http-equiv="Content-Type">
<meta content="utf-8" http-equiv="encoding">
<title>Verification results</title>
<script>
document.addEventListener('DOMContentLoaded', function () {
  const table = document.getElementById('info-table');
  const headers = table.querySelectorAll('th');
  const tableBody = table.querySelector('tbody');
  const rows = tableBody.querySelectorAll('tr');

  // Track sort directions
  const directions = Array.from(headers).map(function (header) {
    return '';
  });

  // Transform the content of given cell in given column
  const transform = function (index, content) {
    // Get the data type of column
    const type = headers[index].getAttribute('data-type');
    switch (type) {
      case 'number':
        return parseFloat(content);
      case 'string':
      default:
        return content;
    }
  };

  const sortColumn = function (index) {
    // Get the current direction
    const direction = directions[index] || 'asc';

    // A factor based on the direction
    const multiplier = direction === 'asc' ? 1 : -1;

    const newRows = Array.from(rows);

    newRows.sort(function (rowA, rowB) {
      const cellA = rowA.querySelectorAll('td')[index].innerHTML;
      const cellB = rowB.querySelectorAll('td')[index].innerHTML;

      const a = transform(index, cellA);
      const b = transform(index, cellB);

      switch (true) {
        case a > b:
          return 1 * multiplier;
        case a < b:
          return -1 * multiplier;
        case a === b:
          return 0;
      }
    });

    // Remove old rows
    [].forEach.call(rows, function (row) {
      tableBody.removeChild(row);
    });

    // Reverse the direction
    directions[index] = direction === 'asc' ? 'desc' : 'asc';

    // Append new row
    newRows.forEach(function (newRow) {
      tableBody.appendChild(newRow);
    });
  };

  [].forEach.call(headers, function (header, index) {
    header.addEventListener('click', function () {
      sortColumn(index);
    });
  });
});
</script>

<script>
const linkify = t => {
  const isValidHttpUrl = s => {
    let u
    try {u = new URL(s)}
    catch (_) {return false}
    return u.protocol.startsWith("http")
  }
  const m = t.match(/(?<=\s|^)[a-zA-Z0-9-:/]+\.[a-zA-Z0-9-].+?(?=[.,;:?!-]?(?:\s|$))/g)
  if (!m) return t
  const a = []
  m.forEach(x => {
    const [t1, ...t2] = t.split(x)
    a.push(t1)
    t = t2.join(x)
    //const y = (!(x.match(/:\/\//)) ? 'https://' : '') + x
    const y = x
    if (isNaN(x) && isValidHttpUrl(y)) 
      //a.push('<a href="' + y + '" target="_blank">' + y.split('/')[2] + '</a>')
      a.push('<a href="' + y + '" target="_blank">' + y + '</a>')
    else
      a.push(x)
  })
  a.push(t)
  return a.join('')
}

window.onload = function(){
  document.querySelectorAll("td").forEach(o => {
    o.innerHTML = linkify(o.innerHTML)
  })
};

/*
//window.addEventListener("load", function(){
window.addEventListener("DOMContentLoaded", function(){
  document.querySelectorAll("td").forEach((i) => {
    console.log(i);
  });
});
*/

</script>

<style>
@font-face {
  font-family: satoshi;
  src: url(satoshi-regular.woff2);
}
@font-face {
  font-family: DroidSansMono;
  src: url(droid-sans-mono.woff2);
}
@font-face {
  font-family: 'Roboto Mono';
  src: url(RobotoMono-VariableFont_wght.ttf);
}
body,
.table {
  font-family: Roboto Mono, monospace;
  font-weight: 100;
  border-collapse: collapse;
}
.table__header {
  background-color: transparent;
  border: none;
  cursor: pointer;
}
.table,
.table th,
.table td {
  border: 1px solid #ccc;
  font-size: 1.0em;
}
.table th,
.table td {
  padding: 0.5rem;
}
.table th {
  cursor: pointer;
  text-decoration: underline;
}
table{
  counter-reset: rowNumber;
}
table tr > td:first-child{
  counter-increment: rowNumber;
}
table tr td:first-child::before {
  content: counter(rowNumber);
  min-width: 1em;
  margin-right: 0.5em;
}
a:link {
  color: inherit;
  text-decoration: none;
}
body {
  color: #eee;
  background: #121212;
}
body a {
  color: #809fff;
}
</style>
</head>

<body><h1>Stinky Results</h1><div><span>Total artifacts: <b>{INSERT_LENGTH}</b></span><br /></div><br />
"""
#  Fill in the artifact count. A literal token swap is used instead of str.format
#  because the template is full of literal JS/CSS braces.
s = s.replace('{INSERT_LENGTH}', str(len(df)))

#  Render the three report columns as the table the page's script looks up by id
table_html = df.to_html(
    columns=['mime-type', 'path', 'hash'],
    index=False,
    justify="center",
    classes=['table', 'table-bordered', 'table-hover'],
    table_id='info-table',
    render_links=True,
)
#  Add an empty leading cell after every bare `<tr>` and one empty header cell before
#  the first `<th>`; the page's stylesheet draws its row counter in that first cell.
table_html = table_html.replace('<tr>', '<tr><td></td>').replace('<th>', '<th></th><th>', 1)

#  The page declares charset=utf-8, so write it as UTF-8 explicitly rather than with
#  the platform default encoding (which can differ and fail on non-ASCII paths).
with open('results.html', 'w', encoding='utf-8') as f:
    f.write(s + table_html + '</body>\n</html>')

#### Interesting snippets

In [None]:
from IPython.display import IFrame

#  Embed the page at ./nice.html in the cell output as a 700x600 frame
IFrame(src='./nice.html', width=700, height=600)

In [None]:
from IPython.display import display, HTML

#  Two raw-HTML outputs, in order: a heading, then a <script> tag
js = "<script>alert('Hello World!');</script>"
for markup in ('<h1>Hello, world!</h1>', js):
    display(HTML(markup))

### Get redirections from proxy data

In [None]:
import json
#  Parse the proxy capture; the code below treats it as a mapping of record id -> record
with open('proxy.json', 'rb') as proxy_file:
    p = json.load(proxy_file)
print(f'Total number of files = "{len(p)}"')

#  Interrogate data format: show one sample record.
#  The None default avoids a StopIteration when the capture is empty.
next(iter(p.values()), None)

#### Create a dictionary of source `URL`s mapped to the `Location` `HTTP response header` when a `redirect` `HTTP response status code` is found

In [None]:
#  Map each redirected request URL to the first `Location` value of its response.
#  Skipped: HEAD requests, records without a status code, and 3xx responses that
#  carry no `Location` header (e.g. 304 Not Modified), which used to raise TypeError.
redirect = {}
for v in p.values():
    status = v.get('ResponseStatusCode')
    if status is None or not 299 < status < 400:
        continue
    if v.get('RequestMethod') == 'HEAD':
        continue
    #  `or {}` covers a record whose header block is absent or null
    locations = (v.get('ResponseHeader') or {}).get('Location')
    if not locations:
        continue
    redirect[v.get('Url')] = locations[0]
print(f'Length of dictionary = "{len(redirect)}"')
#  Preview the first two entries
{k: redirect[k] for k in list(redirect)[:2]}

In [None]:
def get_redirect_flow(url_string, redirect_map=None):
    """Return every URL that redirects, directly or through further hops, to *url_string*.

    The result is ordered nearest-first: a URL that redirects straight to
    *url_string* is followed by the URLs that redirect to it, before the next
    direct source is listed.

    Args:
        url_string: Redirect target to trace back from.
        redirect_map: Mapping of source URL -> redirect target. Defaults to the
            notebook-level ``redirect`` dictionary.

    Returns:
        A list of source URLs; empty when nothing redirects to *url_string*.
        Each URL appears at most once, so a redirect loop (A -> B -> A, or a
        self-redirect) ends the walk instead of recursing forever.
    """
    mapping = redirect if redirect_map is None else redirect_map
    #  Seeded with the starting URL so that a loop leading back to it is cut
    seen = {url_string}
    redirect_flow = []

    def walk(target):
        for src_url, redirect_url in mapping.items():
            if redirect_url == target and src_url not in seen:
                seen.add(src_url)
                redirect_flow.append(src_url)
                #  Depth-first: list this source's own sources before its siblings
                walk(src_url)

    walk(url_string)
    return redirect_flow

#  Trace one known download URL back through the redirects that led to it
result = get_redirect_flow('https://cpan.metacpan.org/authors/id/M/MI/MIYAGAWA/App-cpanminus-1.7042.tar.gz')
#  The flow is collected nearest-first; show a reversed copy and leave `result` untouched
list(reversed(result))

In [None]:
#  Report every non-HEAD request URL that was reached through two or more redirect sources
for record in p.values():
    if record.get('RequestMethod') == 'HEAD':
        continue
    url = record.get('Url')
    flow = get_redirect_flow(url)
    if len(flow) < 2:
        continue
    print(f'{url} redirect flow {flow}')
    