# Hey there!
Let's continue - now that we have the .flow file, we can parse and analyse it.

In [None]:
# Run this if you don't have the modules installed
!pip install mitmproxy pandas requests datetime json

In [None]:
import csv
import json
import re
import sys
from datetime import datetime

import pandas as pd
import requests
from mitmproxy import http
from mitmproxy import io
from mitmproxy.exceptions import FlowReadException
from mitmproxy.io import FlowReader

Put the filename that you chose before (e.g. appname.flow) here


In [None]:
# Name of the capture file chosen in the previous step (e.g. appname.flow).
file = "makemore.flow"
# The CSV is written next to it. Only the final extension is replaced, so a name or
# path that contains extra dots (e.g. "./captures/my.app.flow") keeps its full stem
# instead of being cut at the first dot.
output = file.rsplit(".", 1)[0] + ".csv"

Here we read the .flow file and turn it into a CSV for further analysis.

In [None]:
def clean_bytes(data):
    """Turn a raw message body into text that can be stored in the CSV.

    Args:
        data: The body to convert; normally ``bytes``, but may be ``None`` or empty.

    Returns:
        ``""`` for a missing/empty body, the UTF-8 decoding of ``data`` (undecodable
        bytes become U+FFFD) when it is bytes-like, otherwise ``str(data)``.
    """
    if not data:
        return ""
    try:
        return data.decode("utf-8", errors="replace")
    except (AttributeError, TypeError):
        # Not a bytes-like object (e.g. already a str): fall back to its string form.
        # Only these errors are caught so Ctrl-C / SystemExit are never swallowed.
        return str(data)


def _render_message(message):
    """Render an HTTP request/response as header lines, a blank line, then the body.

    Each header goes on its own line so that the end of one header value is never
    glued to the next header name or to the body; otherwise the whole-word keyword
    search applied to this text later can miss matches at those seams.

    ``get_content(strict=False)`` is used instead of ``.content`` so that a body whose
    content-encoding cannot be decoded is kept as-is instead of raising.
    """
    header_lines = "".join(f"{k}: {v}\r\n" for k, v in message.headers.items())
    return header_lines + "\r\n" + clean_bytes(message.get_content(strict=False))


with open(file, "rb") as logfile, \
        open(output, "w", newline="", encoding="utf-8") as csvfile:
    freader = io.FlowReader(logfile)

    fieldnames = [
        "timestamp", "method", "url",
        "full_request", "full_response"
    ]
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()

    try:
        for flow in freader.stream():
            # A capture may also hold non-HTTP flows (TCP, UDP, DNS ...); those have
            # no HTTP request to export, so they are skipped.
            if not isinstance(flow, http.HTTPFlow):
                continue

            req = flow.request
            res = flow.response  # None when no response was captured

            writer.writerow({
                "timestamp": datetime.fromtimestamp(req.timestamp_start).isoformat(),
                "method": req.method,
                "url": req.pretty_url,
                "full_request": _render_message(req),
                "full_response": _render_message(res) if res else ""
            })
    except FlowReadException as exc:
        # Rows written before the damaged part of the file are kept in the CSV.
        print(f"Flow file corrupted, stopped reading early: {exc}")

In [None]:
# Load the CSV produced by the previous cell into a DataFrame for filtering.
df = pd.read_csv(filepath_or_buffer=output)

# To preview the full table, uncomment the line below.
# df

You'll need to put keywords that will be used for filtering here.

I put my city / country / postal code / IP address / lat + long. You can also look through the parameters and put something specific you want (like screen brightness).

Please note that these are matched case-sensitively and only as whole words, not as substrings (there is too much code and text out there for substring search).



---



You can also uncomment the last row in the cell below to find out your ip, coordinates and so on (based on your ip).

Please note that if you run it in Google Colab, it will display the data of a Google server somewhere in the world - in that case, simply open the "url" in your browser.


In [None]:
# Terms to search for in the captured requests and responses.
keywords = [
    "lat",
    "lon",
    "loc",
    "postal",
    "Barcelona",
]

# Endpoint used to look up your own IP-based details; uncomment the print to query it.
url = "https://ipinfo.io/json"
# print(requests.get(url).text)

This cell applies the filter to the table from before and adds new columns for matches.

In [None]:
# One alternation of all (escaped) keywords, matched only as whole tokens.
# Lookarounds are used instead of \b because \b needs a word character on one side
# of the boundary: a keyword that starts or ends with punctuation (e.g. a negative
# coordinate such as "-3.70") could never match after "=", ":" or a space.
# For keywords made of word characters the behaviour is the same as \b...\b.
pattern = r'(?<!\w)(?:' + '|'.join(re.escape(k) for k in keywords) + r')(?!\w)'
regex = re.compile(pattern)

def extract_context(text, pattern, window=40):
    """Collect a short snippet of surrounding text for every match of ``pattern``.

    Args:
        text: The string to search; a missing value (NaN/None) yields ``""``.
        pattern: A compiled regular expression.
        window: Number of characters kept on each side of a match.

    Returns:
        The snippets, each wrapped in ``...``, joined by ``" | "``; ``""`` if no match.
    """
    if pd.isna(text):
        return ""

    def _snippet(hit):
        # Clamp the left edge at 0; slicing past the right end is harmless.
        left = max(hit.start() - window, 0)
        right = hit.end() + window
        flattened = text[left:right].replace("\n", " ").replace("\r", "")
        return f"...{flattened}..."

    return " | ".join(_snippet(hit) for hit in pattern.finditer(text))

def extract_keywords(text, pattern):
    """List the distinct keywords that ``pattern`` finds in ``text``.

    Args:
        text: The string to search; a missing value (NaN/None) yields ``""``.
        pattern: A compiled regular expression.

    Returns:
        The distinct matched strings in order of first appearance, joined by
        ``" | "``; ``""`` if nothing matched.
    """
    if pd.isna(text):
        return ""
    # dict.fromkeys de-duplicates while keeping first-seen order. Joining a set
    # instead would make the column's ordering differ from one kernel run to the
    # next, because string hashing is randomized per process.
    unique_hits = dict.fromkeys(match.group() for match in pattern.finditer(text))
    return " | ".join(unique_hits)

# (new column, source column, extractor) - applied in this order so the new columns
# are appended to the table in the same sequence as listed.
derived_columns = [
    ("matched_in_request", "full_request", extract_context),
    ("matched_in_response", "full_response", extract_context),
    ("reason_in_request", "full_request", extract_keywords),
    ("reason_in_response", "full_response", extract_keywords),
]
for target, source, extractor in derived_columns:
    df[target] = df[source].apply(extractor, args=(regex,))

# Keep only the rows where the request or the response produced at least one match.
has_request_hit = df["matched_in_request"] != ""
has_response_hit = df["matched_in_response"] != ""
df_filtered = df[has_request_hit | has_response_hit]

Run the cell below if you want to see the preview of the filtered table (all rows have one or more matches with the pattern).

Scroll to the right to see the new columns:

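matched_in_request - ±40 characters surrounding each match in the request

matched_in_response - ±40 characters surrounding each match in the response

reason_in_request - the keywords that matched in the request

reason_in_response - the keywords that matched in the response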

In the left part of the table you can also see the index of each row - you might need it later.

In [None]:
# Display the rows that have a match in the request or in the response.
df_filtered

Now the automation stops and it's time for manual analysis.

The data formats are too different to automate this (or too sophisticated for me), and you will want to filter out the false positives yourself.

Good news: usually this table only has around 10 rows, so it's not hard to look through all of them.

The cell below simply prints out the entire value of a given row (index, e.g. 43) and column - use the column names from the table above.

When the value is too large to read through in plain text, I copy it to SublimeText and use the search in there.

In [None]:
# Look up a single value by row label and column name.
# 43 is only an example label - replace it with an index shown in the filtered table.
df.loc[43, "full_request"]