In [None]:
import numpy as np
import pandas as pd
import numpy.random as npr
import json

1\. **Text files**

Perform the following operations on plain `txt` files:

+ create a list of integer numbers and then save it to a text file named `data_int.txt`. Run the `cat` command to print the content of the file.
+ create a matrix of 5x5 floats and then save it to a text file named `data_float.txt`. Use the `cat` command to print the content of the file.
+ load the `txt` file of the previous point and convert it to a `csv` file by hand.

In [None]:
def convertToCSV(srcFileName, destFileName):
    """Copy a blank-separated text file to a comma-separated one, "by hand".

    The whole content of `srcFileName` is read at once and written to
    `destFileName` with every single space character turned into a comma.
    Line breaks are left untouched, so the row structure is preserved.

    Parameters
    ----------
    srcFileName : path of the text file to read.
    destFileName : path of the CSV file to create (overwritten if it exists).
    """
    with open(srcFileName, 'r') as source:
        with open(destFileName, 'w') as target:
            # splitting on ' ' and re-joining with ',' swaps each blank for a comma
            target.write(",".join(source.read().split(" ")))

# create the list
anIntList = [x for x in range(10)]
aFloatList = np.linspace(0,1,25).reshape(5,5)
intFileName = "data/data_int.txt"
floatFileName = "data/data_float.txt"

# saving to file
with open(intFileName, 'w') as outFile:
    for n in range(len(anIntList)-1): outFile.write(str(anIntList[n]) + ' ')
    outFile.write(str(anIntList[-1]))
    
with open(floatFileName, 'w') as outFile:
    for n in range(len(aFloatList)):
        for m in range(len(aFloatList[n]) -1):
            outFile.write(str(aFloatList[n][m]) + ' ')
        outFile.write(str(aFloatList[n][-1]))
        outFile.write("\n")

# printing the two files
print("integer list from file:")
!cat ./data/data_int.txt
print()
print("\nfloat matrix from file:")
!cat ./data/data_float.txt

# converting to CSV and printing the results
convertToCSV(intFileName, "data_int.csv")
convertToCSV(floatFileName, "data_float.csv")
print("\ninteger CSV from file:")
!cat ./data/data_int.csv
print()
print("\nfloat CSV from file:")
!cat ./data/data_float.csv

2\. **JSON files**

Load the file `user_data.json`, which can be found at:

- https://www.dropbox.com/s/sz5klcdpckc39hd/user_data.json

and filter the data by "CreditCardType", keeping only the rows where it equals "American Express". Then save the filtered data to a new CSV file.

In [None]:
#!wget https://www.dropbox.com/s/sz5klcdpckc39hd/user_data.json -P ./data
#!cat data/user_data.json

# Load the user records into a DataFrame.
with open('./data/user_data.json', 'r') as f:
    data = pd.read_json(f)

# Keep only the rows whose card type is American Express and export them as CSV.
isAmex = data['CreditCardType'].eq("American Express")
amex = data.loc[isAmex]
amex.to_csv('./data/amex_customers.csv')

#!cat ./data/amex_customers.csv

3\. **CSV files with Pandas**

Load the file from this url:

- https://www.dropbox.com/s/kgshemfgk22iy79/mushrooms_categorized.csv

with Pandas. 

+ explore and print the DataFrame
+ calculate, using `groupby()`, the average value of each feature, separately for each class
+ save the file in a JSON format.

In [None]:
#!wget https://www.dropbox.com/s/kgshemfgk22iy79/mushrooms_categorized.csv -P data/
with open("data/mushrooms_categorized.csv", 'r') as fin:
    mushrooms = pd.read_csv(fin)

# explore and print the DataFrame (a preview and summary rather than a full dump)
print("shape:", mushrooms.shape)
print(mushrooms.head())
print(mushrooms.describe())

# Average value of each feature, separately for each class.
# numeric_only=True keeps the call from raising should the file contain a
# non-numeric column; with all-numeric features the result is unchanged.
data = mushrooms.groupby('class').mean(numeric_only=True)
print("\naverage of each feature per class:")
print(data)

# save the per-class averages in JSON format
data.to_json("data/mushroom_recap.json")

4\. **Reading a database**

Get the database `sakila.db` from the lecture `06_dataio.ipynb`, and import the table `actors` as a Pandas dataframe. Using the dataframe, count how many actors have a first name that begins with `A`.

*Hint:* use the Series `.str` method to apply the Python string methods to the elements of a Series, see [documentation](https://pandas.pydata.org/docs/reference/api/pandas.Series.str.html).

5\. **Reading the credit card numbers**

Get the binary file named `credit_card.dat` from this address:

- https://www.dropbox.com/s/8m0syw2tkul3dap/credit_card.dat

and convert the data into the real credit card number, knowing that:
- each line corresponds to a credit card number, which consists of 16 characters (which are numbers in the 0-9 range) divided in 4 blocks, with a whitespace between each block
- each character is written using a 6 bit binary representation (including the whitespace)
- the final 4 bits of each line are a padding used to determine the end of the line, and can be ignored

*Hint*: convert the binary numbers to the decimal representation first, and then use the `chr()` function to convert the latter to a char

In [None]:
#!wget https://www.dropbox.com/s/8m0syw2tkul3dap/credit_card.dat -P data/
#!hexdump data/credit_card.dat

BITS_PER_CHAR = 6    # every character is stored as a 6-digit binary string
CHARS_PER_CARD = 19  # 16 digits in 4 blocks plus the 3 blanks between the blocks
PAYLOAD_BITS = BITS_PER_CHAR * CHARS_PER_CARD  # 114; whatever follows on a line is padding


def decode_card_line(bits):
    """Decode one line of `credit_card.dat` into the readable card number.

    Parameters
    ----------
    bits : bytes or str made of '0'/'1' characters. Surrounding whitespace
        (including the line terminator) and anything after the first
        `PAYLOAD_BITS` characters (the end-of-line padding) are ignored.

    Returns
    -------
    str : the `CHARS_PER_CARD` decoded characters, e.g. four blocks of four
        digits separated by blanks.

    Raises
    ------
    ValueError : if the line is too short to hold a full card number or a
        6-character group is not a valid binary number.
    """
    bits = bits.strip()
    if len(bits) < PAYLOAD_BITS:
        raise ValueError(
            f"expected at least {PAYLOAD_BITS} binary digits per line, got {len(bits)}"
        )
    # binary group -> integer code point -> character
    return "".join(
        chr(int(bits[start:start + BITS_PER_CHAR], 2))
        for start in range(0, PAYLOAD_BITS, BITS_PER_CHAR)
    )


# Iterating line by line keeps the decoding aligned whatever the line
# terminator is, instead of relying on fixed byte offsets.
with open('data/credit_card.dat', 'rb') as fin:
    for raw_line in fin:
        if raw_line.strip():  # skip blank lines, e.g. an empty last line
            print(decode_card_line(raw_line))



6\. **Write data to a binary file**

a) Start from the `data/data_000637.txt` file that we have used during the previous lectures, and convert it to a binary file according to the format defined below:

In [None]:
# Display the picture stored in images/data_format.png; being the last
# expression of the cell, the Image object is rendered as the cell output.
from IPython.display import Image
Image("images/data_format.png")

*Hints*:
- Read the first 10 lines using Pandas
- Iterate over the DataFrame rows
- For every row, "pack" the values (features) into a single 64-bit word, according to the format specified above. Use bit-wise shifts and operators to do so.
- Write each 64-bit word to a binary file. You can use `struct` in this way:
```
binary_file.write( struct.pack('<q', word) )
```
where `word` is the 64-bit word.
- Close the file after completing the loop.

b) Check that the binary file is correctly written by reading it with the code used in the lecture `06_dataio.ipynb`, and verify that the content of the `txt` and binary files is consistent.

c) What is the difference in size on disk between the equivalent `txt` and binary files?

In [None]:
#!wget https://www.dropbox.com/s/ga9wi6b40cakgae/data_000637.txt -P data/

import struct
import time  # imported by the original cell; not used here

# Bit layout of one 64-bit word as (name, shift, mask), most significant field first.
# Widths: HEAD 2, FPGA 4, CHANNEL 9, ORBIT_CNT 32, BX_CNT 12, TDC_MEAS 5 bits (64 in total).
# The writer and the reader both use this table, so they cannot disagree.
WORD_FIELDS = (
    ('HEAD',      62, 0x3),
    ('FPGA',      58, 0xF),
    ('CHANNEL',   49, 0x1FF),
    ('ORBIT_CNT', 17, 0xFFFFFFFF),
    ('BX_CNT',     5, 0xFFF),
    ('TDC_MEAS',   0, 0x1F),
)
WORD_FORMAT = '<q'  # little-endian signed 64-bit integer, as requested by the exercise
N_ROWS = 10         # the exercise asks for the first 10 lines of data


def pack_word(values):
    """Pack one row of features into a single 64-bit word.

    Parameters
    ----------
    values : sequence whose first six items are, in order, HEAD, FPGA,
        TDC channel, orbit counter, BX counter and TDC measurement.

    Returns
    -------
    int : the word as a signed 64-bit value, ready for `struct.pack('<q', ...)`.

    Raises
    ------
    ValueError : if fewer than six values are given or a value does not fit
        in its field (it would otherwise corrupt the neighbouring fields).
    """
    if len(values) < len(WORD_FIELDS):
        raise ValueError(f"expected {len(WORD_FIELDS)} values, got {len(values)}")
    word = 0
    for (name, shift, mask), value in zip(WORD_FIELDS, values):
        value = int(value)
        if not 0 <= value <= mask:
            raise ValueError(f"{name}={value} does not fit in its field (max {mask})")
        word |= value << shift
    # '<q' is signed: when bit 63 is set, express the same bit pattern as a negative number
    if word >= 1 << 63:
        word -= 1 << 64
    return word


def unpack_word(word):
    """Split a 64-bit word into a dict {field name: value} following `WORD_FIELDS`."""
    return {name: (word >> shift) & mask for name, shift, mask in WORD_FIELDS}


dataIn = pd.read_csv('./data/data_000637.txt', nrows=N_ROWS)
print("the original text file reads as follows: \n")
print(dataIn)

# Writing the same rows as a text file for the size comparison. CSV (without the
# pandas index) is the format of the source file, so the two files are equivalent.
dataIn.to_csv('data/text_file.txt', index=False)

with open('data/binary_file.dat', 'wb') as fbin:
    for row in dataIn.to_numpy():
        fbin.write(struct.pack(WORD_FORMAT, pack_word(row)))

# Read the binary file back, one 8-byte word at a time, and decode every field.
data = {}

with open('data/binary_file.dat', 'rb') as file:
    file_content = file.read()

for word_counter, (word,) in enumerate(struct.iter_unpack(WORD_FORMAT, file_content), start=1):
    data[word_counter] = unpack_word(word)

print("\n\nsaving the file as binary and subsequently reading it with the code presented during the lecture, we obtain:\n")
df = pd.DataFrame(
    list(data.values()),
    index=list(data),
    columns=[name for name, _, _ in WORD_FIELDS],
)
print(df)

# Check the consistency instead of only eyeballing the two printouts: the decoded
# values must equal, position by position, the values read from the text file.
if not np.array_equal(df.to_numpy(), dataIn.to_numpy()[:, :len(WORD_FIELDS)]):
    raise AssertionError("the content of the binary file differs from the rows read from the text file")

print("\nas we can see, both files read correctly.")


In [None]:
from pathlib import Path

# Report the size on disk, in bytes, of the text file and of the binary file.
for label, fileName in (("text file size:", 'data/text_file.txt'),
                        ("bin file size:", 'data/binary_file.dat')):
    print(label, Path(fileName).stat().st_size)