In [10]:
#!/usr/bin/env python3

"""
Reference collector script for NetFlow v1, v5, and v9 Python package.
This file belongs to https://github.com/bitkeks/python-netflow-v9-softflowd.
Copyright 2016-2020 Dominik Pataky <software+pynetflow@dpataky.eu>
Licensed under MIT License. See LICENSE.
"""
import argparse
import gzip
import json
import logging
import queue
import socket
import socketserver
import threading
import time
import asyncio
import nest_asyncio
nest_asyncio.apply()
from collections import namedtuple


from package.ipfix import IPFIXTemplateNotRecognized
#from package.utils import *
from package.utils import UnknownExportVersion, parse_packet, flow_filter_v4, flow_filter_v6
from package.v9 import V9TemplateNotRecognized
from package.mysql_os import MysqlOperation
from package.influxdb_os import InsertRecords


RawPacket = namedtuple('RawPacket', ['ts', 'client', 'data'])
ParsedPacket = namedtuple('ParsedPacket', ['ts', 'client', 'export'])

# Amount of time to wait before dropping an undecodable ExportPacket
PACKET_TIMEOUT = 60 * 60

logger = logging.getLogger("netflow-collector")
ch = logging.StreamHandler()
formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
ch.setFormatter(formatter)
logger.addHandler(ch)


class QueuingRequestHandler(socketserver.BaseRequestHandler):
    def handle(self):
        data = self.request[0]  # get content, [1] would be the socket
        self.server.queue.put(RawPacket(time.time(), self.client_address, data))
        logger.debug(
            "Received %d bytes of data from %s", len(data), self.client_address
        )


class QueuingUDPListener(socketserver.ThreadingUDPServer):
    """A threaded UDP server that adds a (time, data) tuple to a queue for
    every request it sees
    """

    def __init__(self, interface, queue):
        self.queue = queue

        # If IPv6 interface addresses are used, override the default AF_INET family
        if ":" in interface[0]:
            self.address_family = socket.AF_INET6

        super().__init__(interface, QueuingRequestHandler)


class ThreadedNetFlowListener(threading.Thread):
    """A thread that listens for incoming NetFlow packets, processes them, and
    makes them available to consumers.
    - When initialized, will start listening for NetFlow packets on the provided
      host and port and queuing them for processing.
    - When started, will start processing and parsing queued packets.
    - When stopped, will shut down the listener and stop processing.
    - When joined, will wait for the listener to exit
    For example, a simple script that outputs data until killed with CTRL+C:
    >>> listener = ThreadedNetFlowListener('0.0.0.0', 2055)
    >>> print("Listening for NetFlow packets")
    >>> listener.start() # start processing packets
    >>> try:
    ...     while True:
    ...         ts, export = listener.get()
    ...         print("Time: {}".format(ts))
    ...         for f in export.flows:
    ...             print(" - {IPV4_SRC_ADDR} sent data to {IPV4_DST_ADDR}"
    ...                   "".format(**f))
    ... finally:
    ...     print("Stopping...")
    ...     listener.stop()
    ...     listener.join()
    ...     print("Stopped!")
    """

    def __init__(self, host: str, port: int):
        logger.info("Starting the NetFlow listener on {}:{}".format(host, port))
        self.output = queue.Queue()
        self.input = queue.Queue()
        self.server = QueuingUDPListener((host, port), self.input)
        self.thread = threading.Thread(target=self.server.serve_forever)
        self.thread.start()
        self._shutdown = threading.Event()
        super().__init__()

    def get(self, block=True, timeout=None) -> ParsedPacket:
        """Get a processed flow.
        If optional args 'block' is true and 'timeout' is None (the default),
        block if necessary until a flow is available. If 'timeout' is
        a non-negative number, it blocks at most 'timeout' seconds and raises
        the queue.Empty exception if no flow was available within that time.
        Otherwise ('block' is false), return a flow if one is immediately
        available, else raise the queue.Empty exception ('timeout' is ignored
        in that case).
        """
        return self.output.get(block, timeout)

    def run(self):
        # Process packets from the queue
        try:
            templates = {"netflow": {}, "ipfix": {}}
            to_retry = []
            while not self._shutdown.is_set():
                try:
                    # 0.5s delay to limit CPU usage while waiting for new packets
                    pkt = self.input.get(block=True, timeout=0.5)  # type: RawPacket
                except queue.Empty:
                    continue

                try:
                    # templates is passed as reference, updated in V9ExportPacket
                    export = parse_packet(pkt.data, templates)
                except UnknownExportVersion as e:
                    logger.error("%s, ignoring the packet", e)
                    continue
                except (V9TemplateNotRecognized, IPFIXTemplateNotRecognized):
                    # TODO: differentiate between v9 and IPFIX, use separate to_retry lists
                    if time.time() - pkt.ts > PACKET_TIMEOUT:
                        logger.warning("Dropping an old and undecodable v9/IPFIX ExportPacket")
                    else:
                        to_retry.append(pkt)
                        logger.debug("Failed to decode a v9/IPFIX ExportPacket - will "
                                     "re-attempt when a new template is discovered")
                    continue

                if export.header.version == 10:
                    logger.debug("Processed an IPFIX ExportPacket with length %d.", export.header.length)
                else:
                    logger.debug("Processed a v%d ExportPacket with %d flows.",
                                 export.header.version, export.header.count)

                # If any new templates were discovered, dump the unprocessable
                # data back into the queue and try to decode them again
                if export.header.version in [9, 10] and export.contains_new_templates and to_retry:
                    logger.debug("Received new template(s)")
                    logger.debug("Will re-attempt to decode %d old v9/IPFIX ExportPackets", len(to_retry))
                    for p in to_retry:
                        self.input.put(p)
                    to_retry.clear()

                self.output.put(ParsedPacket(pkt.ts, pkt.client, export))
        finally:
            # Only reached when while loop ends
            self.server.shutdown()
            self.server.server_close()

    def stop(self):
        logger.info("Shutting down the NetFlow listener")
        self._shutdown.set()

    def join(self, timeout=None):
        self.thread.join(timeout=timeout)
        super().join(timeout=timeout)


def get_export_packets(host: str, port: int) -> ParsedPacket:
    """A threaded generator that will yield ExportPacket objects until it is killed
    """
    listener = ThreadedNetFlowListener(host, port)
    listener.start()
    try:
        while True:
            yield listener.get()
    finally:
        listener.stop()
        listener.join()
        
async def main(*flows):
    myTool = MysqlOperation()
    await asyncio.gather(
        
        myTool.insertRecords(*flows)
    )

if __name__ == "netflow.collector":
    logger.error("The collector is currently meant to be used as a CLI tool only.")
    logger.error("Use 'python3 -m netflow.collector -h' in your console for additional help.")




        # With every parsed flow a new line is appended to the output file. In previous versions, this was implemented
        # by storing the whole data dict in memory and dumping it regularly onto disk. This was extremely fragile, as
        # it a) consumed a lot of memory and CPU (dropping packets since storing one flow took longer than the arrival
        # of the next flow) and b) broke the exported JSON file, if the collector crashed during the write process,
        # rendering all collected flows during the runtime of the collector useless (the file contained one large JSON
        # dict which represented the 'data' dict).

        # In this new approach, each received flow is parsed as usual, but it gets appended to a gzipped file each time.
        # All in all, this improves in three aspects:InnodbOperation
        # 1. collected flow data is not stored in memory any more
        # 2. received and parsed flows are persisted reliably
        # 3. the disk usage of files with JSON and its full strings as keys is reduced by using gzipped files
        # This also means that the files have to be handled differently, because they are gzipped and not formatted as
        # one single big JSON dump, but rather many little JSON dumps, separated by line breaks.
    flows = []
        #for ts, client, export in get_export_packets("0.0.0.0", 9996):
    record = pd.read_csv("test.csv")
    print(record.head(5))

ModuleNotFoundError: No module named 'aioinflux'

In [13]:
import pandas as pd
record = pd.read_csv("test.csv")
print(record.head(5))

      218.146.20.61 2020-06-28 23:55:43.274 2020-06-29 00:08:50.735     6       63     6905       70
0      108.82.154.57 2020-06-29 00:04:46.736 2020...                                                
1     212.102.35.141 2020-06-29 00:07:17.380 2020...                                                
2    104.214.230.139 2020-06-29 00:02:21.153 2020...                                                
3       154.24.10.58 2020-06-29 00:04:11.048 2020...                                                
4       18.196.98.21 2020-06-28 23:56:58.631 2020...                                                


In [21]:
import csv

with open("test.csv") as f:
    reader = csv.reader(f)
    data = [tuple(row) for row in reader]

print(data[10])

('   138.246.193.20 2020-06-28 23:59:58.701 2020-06-29 00:09:02.980    74       77     3299       48',)


In [17]:
from csv import reader
# open file in read mode
with open('test.csv', 'r') as read_obj:
    # pass the file object to reader() to get the reader object
    csv_reader = reader(read_obj)
    # print(csv_reader)
    # Get all rows of csv from csv_reader object as list of tuples
    list_of_tuples = list(map(tuple, csv_reader))
    # display all rows of csv
    print(list_of_tuples[1])

('    108.82.154.57 2020-06-29 00:04:46.736 2020-06-29 00:08:30.961     9        9      360       12',)


In [39]:
import pandas as ps
df4 = ps.read_csv('test.csv', delim_whitespace=True, names=['srcaddr', 'first', 'last', 'flows', 'packets', 'bytes', 'bps'])
print(df4.head(5))

                                 srcaddr       first          last  flows  \
218.146.20.61   2020-06-28  23:55:43.274  2020-06-29  00:08:50.735      6   
108.82.154.57   2020-06-29  00:04:46.736  2020-06-29  00:08:30.961      9   
212.102.35.141  2020-06-29  00:07:17.380  2020-06-29  00:07:17.380      1   
104.214.230.139 2020-06-29  00:02:21.153  2020-06-29  00:08:34.975     15   
154.24.10.58    2020-06-29  00:04:11.048  2020-06-29  00:04:23.261      1   

                            packets  bytes   bps  
218.146.20.61   2020-06-28     63.0   6905    70  
108.82.154.57   2020-06-29      9.0    360    12  
212.102.35.141  2020-06-29      1.0    134     0  
104.214.230.139 2020-06-29    150.0  88581  1895  
154.24.10.58    2020-06-29      3.0    132    86  


In [7]:
import pandas as ps
df4 = ps.read_csv('test_header.csv', error_bad_lines = False)
print(df4.head(5))

b'Skipping line 298279: expected 1 fields, saw 6\nSkipping line 298281: expected 1 fields, saw 3\n'


        Src IP Addr Date first seen         Date last seen          Proto Flows  Packets    Bytes      bps
0      218.146.20.61 2020-06-28 23:55:43.274 2020...                                                      
1      108.82.154.57 2020-06-29 00:04:46.736 2020...                                                      
2     212.102.35.141 2020-06-29 00:07:17.380 2020...                                                      
3    104.214.230.139 2020-06-29 00:02:21.153 2020...                                                      
4       154.24.10.58 2020-06-29 00:04:11.048 2020...                                                      


In [8]:
df4.head(298278)

Unnamed: 0,Src IP Addr Date first seen Date last seen Proto Flows Packets Bytes bps
0,218.146.20.61 2020-06-28 23:55:43.274 2020...
1,108.82.154.57 2020-06-29 00:04:46.736 2020...
2,212.102.35.141 2020-06-29 00:07:17.380 2020...
3,104.214.230.139 2020-06-29 00:02:21.153 2020...
4,154.24.10.58 2020-06-29 00:04:11.048 2020...
...,...
298273,179.36.147.72 2020-06-29 00:03:14.178 2020...
298274,109.117.39.134 2020-06-29 00:08:12.797 2020...
298275,129.187.61.156 2020-06-29 00:02:34.798 2020...
298276,105.213.96.92 2020-06-29 00:04:22.291 2020...


In [11]:
df4.drop([len(df4)-1],axis=0,inplace=True)

In [12]:
df4

Unnamed: 0,Src IP Addr Date first seen Date last seen Proto Flows Packets Bytes bps
0,218.146.20.61 2020-06-28 23:55:43.274 2020...
1,108.82.154.57 2020-06-29 00:04:46.736 2020...
2,212.102.35.141 2020-06-29 00:07:17.380 2020...
3,104.214.230.139 2020-06-29 00:02:21.153 2020...
4,154.24.10.58 2020-06-29 00:04:11.048 2020...
...,...
298272,95.91.234.111 2020-06-29 00:04:12.096 2020...
298273,179.36.147.72 2020-06-29 00:03:14.178 2020...
298274,109.117.39.134 2020-06-29 00:08:12.797 2020...
298275,129.187.61.156 2020-06-29 00:02:34.798 2020...


In [16]:
df4.names

AttributeError: 'DataFrame' object has no attribute 'names'

In [1]:
from csv import reader
import socket, struct
import re
import argparse
import gzip
import json
import logging
import queue
import socketserver
import threading
import time
import asyncio
import nest_asyncio
nest_asyncio.apply()
from collections import namedtuple


from package.ipfix import IPFIXTemplateNotRecognized
#from package.utils import *
from package.utils import UnknownExportVersion, parse_packet, flow_filter_v4, flow_filter_v6
from package.v9 import V9TemplateNotRecognized
from package.mysql_os import MysqlOperation
#from package.influxdb_os import InsertRecords

myTool = MysqlOperation()

# open file in read mode
with open('test.csv', 'r') as read_obj:
    flow_list = []
    row = ()
    for line in read_obj:
        line = line.replace("\n", "")
        row = line.split(' ')
        
        row = list(filter(None, row))

        #row = row[1].join(row[2])
        if(len(row) == 9):
            (srcaddr, first_Y_M_D, first_H_M_S,last_Y_M_D, last_H_M_S, flows, packets, byte, bps) = row
            try:
                socket.inet_aton(srcaddr)
                srcaddr = struct.unpack('!L', socket.inet_aton(srcaddr))[0]
            except socket.error:
                continue
                # srcaddr = struct.unpack('!QQ', socket.inet_pton(socket.AF_INET6, srcaddr))[0]
            first = first_Y_M_D + first_H_M_S
            first = re.split("-|:|\.",first)
            first = int("".join(first[0:5]))
            last = last_Y_M_D+last_H_M_S
            last = re.split("-|:|\.",last)
            last = int("".join(last[0:5]))
            flows = int(flows)
            packets = int(float(packets))
            byte = int(float(byte))
            bps = int(float(bps))
            row = (srcaddr, first, last, flows, packets, byte, bps) 
        elif(len(row) == 10):
            (srcaddr, first_Y_M_D, first_H_M_S,last_Y_M_D, last_H_M_S, flows, packets, byte, byte_unit, bps) = row
            try:
                socket.inet_aton(srcaddr)
                srcaddr = struct.unpack('!L', socket.inet_aton(srcaddr))[0]
            except socket.error:
                continue
                # srcaddr = struct.unpack('!QQ', socket.inet_pton(socket.AF_INET6, srcaddr))[0]
            first = first_Y_M_D + first_H_M_S
            first = re.split("-|:|\.",first)
            first = int("".join(first[0:5]))
            last = last_Y_M_D+last_H_M_S
            last = re.split("-|:|\.",last)
            last = int("".join(last[0:5]))
            flows = int(flows)
            packets = int(float(packets))
            if(byte_unit == 'M'):
                byte = int(float(byte)) * 10**6
            elif(byte_unit == 'G'):
                byte = int(float(byte)) * 10**9
            elif(bps == 'M'): 
                bps = byte_unit * 10**6
            elif(bps == 'G'): 
                bps = byte_unit * 10**9
            row = (srcaddr, first, last, flows, packets, byte, bps) 
        elif(len(row) == 11):
            (srcaddr, first_Y_M_D, first_H_M_S,last_Y_M_D, last_H_M_S, flows, packets, byte, byte_unit, bps, bps_unit) = row
            try:
                socket.inet_aton(srcaddr)
                srcaddr = struct.unpack('!L', socket.inet_aton(srcaddr))[0]
            except socket.error:
                continue
                # srcaddr = struct.unpack('!QQ', socket.inet_pton(socket.AF_INET6, srcaddr))[0]
            first = first_Y_M_D + first_H_M_S
            first = re.split("-|:|\.",first)
            first = int("".join(first[0:5]))
            last = last_Y_M_D+last_H_M_S
            last = re.split("-|:|\.",last)
            last = int("".join(last[0:5]))
            flows = int(flows)
            if(byte == 'M'):
                packets = int(float(packets)) * 10**6
                if(bps == 'M'):
                    byte = int(float(byte_unit)) * 10**6
                elif(bps == 'G'):
                    byte = int(float(byte_unit)) * 10**9
                bps = bps_unit                    
            elif(byte == 'G'):
                packets = int(float(packets)) * 10**9
                if(bps == 'M'):
                    byte = int(float(byte_unit)) * 10**6
                elif(bps == 'G'):
                    byte = int(float(byte_unit)) * 10**9
                bps = bps_unit   
            else:    
                packets = int(float(packets))
                if(byte_unit == 'M'):
                    byte = int(float(byte)) * 10**6
                elif(byte_unit == 'G'):
                    byte = int(float(byte)) * 10**9
                if(bps_unit == 'M'):
                    bps = int(float(bps)) * 10**6
                elif(bps_unit == 'M'):
                    bps = int(float(bps)) * 10**9
            row = (srcaddr, first, last, flows, packets, byte, bps) 
        elif(len(row) == 12):
            (srcaddr, first_Y_M_D, first_H_M_S,last_Y_M_D, last_H_M_S, flows, packets, packets_unit, byte, byte_unit, bps, bps_unit) = row
            try:
                socket.inet_aton(srcaddr)
                srcaddr = struct.unpack('!L', socket.inet_aton(srcaddr))[0]
            except socket.error:
                continue
                # srcaddr = struct.unpack('!QQ', socket.inet_pton(socket.AF_INET6, srcaddr))[0]
            first = first_Y_M_D + first_H_M_S
            first = re.split("-|:|\.",first)
            first = int("".join(first[0:5]))
            last = last_Y_M_D+last_H_M_S
            last = re.split("-|:|\.",last)
            last = int("".join(last[0:5]))
            flows = int(float(flows))
            if(packets_unit == 'M'):
                packets = int(float(packets)) * 10**6
            elif(packets_unit == 'G'):
                packets = int(float(packets)) * 10**9
            if(byte_unit == 'M'):
                byte = int(float(byte)) * 10**6
            elif(byte_unit == 'G'):
                byte = int(float(byte)) * 10**9
            if(bps_unit == 'M'):
                bps = int(float(bps)) * 10**6
            elif(bps_unit == 'M'):
                bps = int(float(bps)) * 10**9
            row = (srcaddr, first, last, flows, packets, byte, bps) 
        flow_list.append(row)
        if(len(flow_list) == 10000):
            
            asyncio.run(asyncio.gather(myTool.insertRecords(*flow_list)))
            flow_list = []
            # asyncio.run(asyncio.gather(myTool.insertRecords(*flow_list), InsertRecords(*flow_list)))
            # print(flow_list)
            # break
    

start to connect db! 
succeed to connect db!


  await self._query(query)


execute insert cost: 
insert res: 10000
close pool!
start to connect db! 
succeed to connect db!
execute insert cost: 
insert res: 10000
close pool!
start to connect db! 
succeed to connect db!
execute insert cost: 
insert res: 10000
close pool!
start to connect db! 
succeed to connect db!
execute insert cost: 
insert res: 10000
close pool!
start to connect db! 
succeed to connect db!
execute insert cost: 
insert res: 10000
close pool!
start to connect db! 
succeed to connect db!
execute insert cost: 
insert res: 10000
close pool!
start to connect db! 
succeed to connect db!
execute insert cost: 
insert res: 10000
close pool!
start to connect db! 
succeed to connect db!
execute insert cost: 
insert res: 10000
close pool!
start to connect db! 
succeed to connect db!
execute insert cost: 
insert res: 10000
close pool!
start to connect db! 
succeed to connect db!
execute insert cost: 
insert res: 10000
close pool!
start to connect db! 
succeed to connect db!
execute insert cost: 
insert r

In [60]:
test = "2020-06-2823:55:43.274"
test = re.split("-|:|\.",first_1)
test = "".join(test[0:5])

In [61]:
test

'20200628235543'

In [66]:
test = '218.146.20.61'
socket.inet_aton(test)

b'\xda\x92\x14='

In [70]:
ipv6 = '2804:a8..8::1392'
try:
    socket.inet_aton(ipv6)
    print("1")
except socket.error:
    print("2")

2


In [78]:
from binascii import hexlify
import ipaddress
ipv6 = '2804:a8..8::1392'

int(hexlify(socket.inet_pton(socket.AF_INET6, ipv6)), 16)


OSError: illegal IP address string passed to inet_pton

In [4]:
!pip install aioinflux

Collecting aioinflux
  Using cached aioinflux-0.9.0-py3-none-any.whl (16 kB)
Collecting ciso8601
  Using cached ciso8601-2.1.3.tar.gz (15 kB)
Building wheels for collected packages: ciso8601
  Building wheel for ciso8601 (setup.py) ... [?25ldone
[?25h  Created wheel for ciso8601: filename=ciso8601-2.1.3-cp37-cp37m-linux_x86_64.whl size=29068 sha256=36c8f014d621f47b7c676c15a21bf3f6358e022fc8020dcf855cdf7f554b1e98
  Stored in directory: /home/yuhao/.cache/pip/wheels/96/0f/89/b1c8e876a1c8ebf41226adea77b12c4540ffc323006124954d
Successfully built ciso8601
Installing collected packages: ciso8601, aioinflux
Successfully installed aioinflux-0.9.0 ciso8601-2.1.3


Collecting ciso8601
  Using cached ciso8601-2.1.3.tar.gz (15 kB)
Building wheels for collected packages: ciso8601
  Building wheel for ciso8601 (setup.py) ... [?25lerror
[31m  ERROR: Command errored out with exit status 1:
   command: /home/yuhao/environments/py37-venv/bin/python3.7 -u -c 'import sys, setuptools, tokenize; sys.argv[0] = '"'"'/tmp/pip-install-5lcwteus/ciso8601/setup.py'"'"'; __file__='"'"'/tmp/pip-install-5lcwteus/ciso8601/setup.py'"'"';f=getattr(tokenize, '"'"'open'"'"', open)(__file__);code=f.read().replace('"'"'\r\n'"'"', '"'"'\n'"'"');f.close();exec(compile(code, __file__, '"'"'exec'"'"'))' bdist_wheel -d /tmp/pip-wheel-blww2gdu
       cwd: /tmp/pip-install-5lcwteus/ciso8601/
  Complete output (18 lines):
  running bdist_wheel
  running build
  running build_py
  package init file 'ciso8601/__init__.py' not found (or not a regular file)
  creating build
  creating build/lib.linux-x86_64-3.7
  creating build/lib.linux-x86_64-3.7/ciso8601
  copying ciso8601/__init__