In [40]:
import numpy as np
from os import system
import os
import pandas as pd
import json
import csv
import mysql.connector
import sqlite3 as sql
import requests
import struct, time

1\. **Text files**

Perform the following operations on plain `txt` files:

+ create a list of integrer numbers and then save it to a text file named `data_int.txt`. Run the `cat` command to print the content of the file.
+ create a matrix of 5x5 floats and then save it to a text file named `data_float.txt`. Use the `cat` command to print the content of the file.
+ load the `txt` file of the previous point and convert it to a `csv` file by hand.

In [41]:
def write_to_file(filename, mode, data):
    with open(filename, mode) as file:
        file.write(data)

def convert_txt_csv(filename, index_status):
    df =  pd.read_fwf(filename)
    csv_filename = filename.replace('.txt', '.csv') 
    df.to_csv(csv_filename, index = index_status)
      
int_lst = np.random.randint(low=1, high=20, size=10)

# Random float values between 0 and 4
float_matrix = np.round(np.random.rand(5, 5), 4) * 4

write_to_file('data_int.txt', 'w', str(int_lst))
write_to_file('data_float.txt', 'w', np.array2string(float_matrix))

print("The output from the cat command for the data_int.txt: ")
system("cat data_int.txt")
print('\n')
print("The output from the cat command for the data_float.txt: ")
system("cat data_float.txt")

convert_txt_csv('data_float.txt', index_status = False)

The output from the cat command for the data_int.txt: 
[ 9 10  6 16  5 10 13 10 11 16]

The output from the cat command for the data_float.txt: 
[[1.9184 3.0096 2.472  1.9156 0.7668]
 [1.7576 0.7072 1.8524 0.1792 3.8568]
 [1.0088 1.1332 3.2732 3.3504 1.832 ]
 [0.824  2.9168 3.4532 1.3344 3.5292]
 [3.46   2.3616 2.9088 0.5652 0.9016]]

2\. **JSON files**

Load the file `user_data.json`, which can be found at:

- https://www.dropbox.com/s/sz5klcdpckc39hd/user_data.json

and filter the data by the "CreditCardType" when it equals to "American Express". Than save the data to a new CSV file.

In [42]:
def filter_and_save_csv(filename = str, wanted_info_type = str, filter_by = str):
    with open(filename) as json_file:
        data = json.load(json_file)
    
    filtered_data = [user for user in data if user.get(wanted_info_type) == filter_by]

    csv_filename = filename.replace('.json', '.csv') 
    with open(csv_filename, 'w', newline='') as data_csv_file:
        csv_writer = csv.writer(data_csv_file)
        
        # Write headers only once
        if filtered_data:
            header = filtered_data[0].keys()
            csv_writer.writerow(header)
        
        # Write filtered data to CSV
        for user in filtered_data:
            csv_writer.writerow(user.values())

filter_and_save_csv('user_data.json', 'CreditCardType', 'American Express')


3\. **CSV files with Pandas**

Load the file from this url:

- https://www.dropbox.com/s/kgshemfgk22iy79/mushrooms_categorized.csv

with Pandas. 

+ explore and print the DataFrame
+ calculate, using `groupby()`, the average value of each feature, separately for each class
+ save the file in a JSON format.

In [43]:
url = "https://www.dropbox.com/s/kgshemfgk22iy79/mushrooms_categorized.csv?dl=1" #?dl=1 is essential to get direct download
df_from_url = pd.read_csv(url)

def explore_dataframe(dataframe):
    print(f'The whole dataset: \n {dataframe} \n')
    print(f'The statistical analysis of dataset: \n {dataframe.describe()} \n')
    print(f'The columns of dataset: \n {dataframe.columns} \n')

def average_value_features(dataframe):
    average_features = dataframe.groupby(['class']).mean()
    return average_features

def csv_to_json(dataframe, json_filename = str):
    dataframe.to_json(r'/Users/RikkeNielsen/ScientificComputingWithPython2023/' + json_filename)

explore_dataframe(df_from_url)
average_value_features(df_from_url)
csv_to_json(df_from_url, 'mushrooms.json')

The whole dataset: 
       class  cap-shape  cap-surface  cap-color  bruises  odor  \
0         1          5            2          4        1     6   
1         0          5            2          9        1     0   
2         0          0            2          8        1     3   
3         1          5            3          8        1     6   
4         0          5            2          3        0     5   
...     ...        ...          ...        ...      ...   ...   
8119      0          3            2          4        0     5   
8120      0          5            2          4        0     5   
8121      0          2            2          4        0     5   
8122      1          3            3          4        0     8   
8123      0          5            2          4        0     5   

      gill-attachment  gill-spacing  gill-size  gill-color  ...  \
0                   1             0          1           4  ...   
1                   1             0          0           4  ... 

4\. **Reading a database**

Get the database `sakila.db` from the lecture `06_dataio.ipynb`, and import the table `actors` as a Pandas dataframe. Using the dataframe, count how many actors have a first name that begins with `A`.

*Hint:* use the Series `.str` method to apply the Python string methods to the elements of a Series, see [documentation](https://pandas.pydata.org/docs/reference/api/pandas.Series.str.html).

In [44]:
url_database = "https://gist.github.com/Piyush3dB/726bf7012785d6e0fd691c3655c92654/raw/2c17ccb2eb33b3396bfa96284c53f0718a4ea62c/sakila.db?dl=1"

response = requests.get(url_database)
with open('sakila.db', 'wb') as file:
    file.write(response.content)

# create a connection to the database and a cursor to execute queries
conn = sql.connect('sakila.db')
cur = conn.cursor()

# query data from database: select all content from the table "actor"
query = "SELECT * FROM actor"
results = cur.execute(query).fetchall()

# create a DataFrame from the DB data
df = pd.DataFrame(results)

# close the cursor and connection
cur.close()
conn.close()

count_name_start_with_A = df[1].str.startswith('A').sum()
print(f'Actors that have a firstname which starts with A: \n {count_name_start_with_A} \n')
# print dataframe
df


Actors that have a firstname which starts with A: 
 13 



Unnamed: 0,0,1,2,3
0,1,PENELOPE,GUINESS,2019-02-16 18:17:33
1,2,NICK,WAHLBERG,2019-02-16 18:17:33
2,3,ED,CHASE,2019-02-16 18:17:33
3,4,JENNIFER,DAVIS,2019-02-16 18:17:33
4,5,JOHNNY,LOLLOBRIGIDA,2019-02-16 18:17:33
...,...,...,...,...
195,196,BELA,WALKEN,2019-02-16 18:17:33
196,197,REESE,WEST,2019-02-16 18:17:33
197,198,MARY,KEITEL,2019-02-16 18:17:33
198,199,JULIA,FAWCETT,2019-02-16 18:17:33


5\. **Reading the credit card numbers**

Get the binary file named `credit_card.dat` from this address:

- https://www.dropbox.com/s/8m0syw2tkul3dap/credit_card.dat

and convert the data into the real credit card number, knowing that:
- each line corresponds to a credit card number, which consists of 16 characters (which are numbers in the 0-9 range) divided in 4 blocks, with a whitespace between each block
- each character is written using a 6 bit binary representation (including the whitespace)
- the final 4 bits of each line are a padding used to determine the end of the line, and can be ignored

*Hint*: convert the binary numbers to the decimal representation first, and then use the `chr()` function to convert the latter to a char

In [45]:
#shape of data:
'''
111111 = 1 character
one credit card number: 111111000000111111000000111111000000111111000000111111000000111111000000111111000000111111000000 (16 characters) + 1010 (padding)
whitespace between each block
'''

def get_binary_file(url = str, filename = str):
    response = requests.get(url)
    if response.status_code == 200:
        with open(filename, 'wb') as f:
            f.write(response.content)
        print("File downloaded successfully.")

def read_bin_file(filename):
    with open(filename, 'rb') as f:
        return f.read()
            
def conv_bin_to_decimal(bin_data, char_size):
    credit_card_nums = []
    for line in bin_data.splitlines():
        credit_card_num = ''
        for i in range(0, len(line), char_size):
            num = int(line[i:i+char_size],2)
            credit_card_num += chr(num)
        credit_card_nums.append(credit_card_num.strip())
    return credit_card_nums

url = "https://www.dropbox.com/s/8m0syw2tkul3dap/credit_card.dat?dl=1"
#get_binary_file(url, 'credit_card.dat')
binary_data = read_bin_file('credit_card.dat')

credit_card_nums = conv_bin_to_decimal(binary_data, 6)
print(credit_card_nums)


['7648 5673 3775 2271', '3257 8247 3354 2266', '2722 0001 4011 6652', '0661 3063 3742 3150', '0432 1608 1462 4742', '5827 2027 8785 7303', '5774 8528 2087 1117', '8140 1210 6352 2845', '5764 1133 7301 7100', '6456 1737 4126 6726', '1228 8631 7382 0000', '7051 0160 5374 3166', '0618 3587 1630 6376', '1545 5454 7444 5636', '6735 3116 3202 6834', '7287 5011 1547 8413', '7033 2607 3328 4200', '2568 5244 1874 5024', '1684 2253 7570 7118', '0672 2576 0575 6631', '6332 8353 8787 1340', '1813 3361 1175 4211', '2477 6450 8840 2368', '5512 3505 2563 1326', '3083 7882 0621 0025', '4521 5148 8045 0334', '7563 3654 8713 5787', '8324 2664 0476 5561', '0565 2504 7168 3510', '5107 5507 1767 0738', '2462 1821 2448 1443', '2788 0638 6861 6554', '5851 5873 5474 0547', '0670 1004 4013 2655', '5874 5506 3048 0806', '2805 5401 8462 1260', '5083 8406 6310 1862', '1076 1445 3013 2266', '8440 4804 4844 5277', '4758 6141 0686 1387', '7586 0675 0315 2568', '2544 1258 7432 5165', '3474 5023 4434 5626', '1410 0270

6\. **Write data to a binary file**

a) Start from the `data/data_000637.txt` file that we have used during the previous lectures, and convert it to a binary file according to the format defined below:

In [46]:
#from IPython.display import Image
#Image("images/data_format.png")

*Hints*:
- Read the first 10 lines using Pandas
- Iterate over the DataFrame rows
- For every row, "pack" the values (features) into a single 64-bit word, according to the format specified above. Use bit-wise shifts and operators to do so.
- Write each 64-bit word to a binary file. You can use `struct` in this way:
```
binary_file.write( struct.pack('<q', word) )
```
where `word` is the 64-bit word.
- Close the file after completing the loop.

b) Check that the binary file is correctly written by reading it with the code used in the lecture `06_dataio.ipynb`, and verify that the content of the `txt` and binary files is consistent.

c) What is the difference of the size on disk between equivalent `txt` and binary files?

In [47]:
data_txt = pd.read_csv('data_000637.txt', nrows=10)

with open('data_000637.dat', 'wb') as binary_file:
    # Iterate over the DataFrame rows
    for index, row in data_txt.iterrows():
        word = (
            int(row['HEAD']) << 62 |
            int(row['FPGA']) << 58 |
            int(row['TDC_CHANNEL']) << 49 |
            int(row['ORBIT_CNT']) << 17 |  
            int(row['BX_COUNTER']) << 5 |  
            int(row['TDC_MEAS']) << 0  
        )
        # Write the 64-bit word to the binary file
        binary_file.write(struct.pack('<q', word))  # Use 'Q' for unsigned long long (64-bit)
    binary_file.close()

data = {}

with open('data_000637.dat', 'rb') as file:
    file_content = file.read()
    word_counter = 0
    word_size = 8 # size of the word in bytes
    for i in range(0, len(file_content), word_size):
        word_counter += 1
        if word_counter > 10: break
        word = struct.unpack('<q', file_content[i : i + word_size])[0] # get an 8-byte word
        head     = (word >> 62) & 0x3
        fpga     = (word >> 58) & 0xF
        tdc_chan = (word >> 49) & 0x1FF
        orb_cnt  = (word >> 17) & 0xFFFFFFFF
        bx       = (word >> 5 ) & 0xFFF
        tdc_meas = (word >> 0 ) & 0x1F
        #if i == 0: print ('{0}\t{1}\t{2}\t{3}\t{4}\t{5}'.format('HEAD', 'FPGA', 'CHANNEL', 'ORBIT_CNT', 'BX_CNT', 'TDC_MEAS'))
        #print('{0}\t{1}\t{2}\t{3}\t{4}\t{5}'.format(head, fpga, tdc_chan, orb_cnt, bx, tdc_meas))
        entry = {'HEAD' : head, 'FPGA' : fpga, 'CHANNEL' : tdc_chan, 'ORBIT_CNT' : orb_cnt, 'BX_CNT' : bx, 'TDC_MEAS' : tdc_meas}
        #df = df.append(entry, ignore_index=True)
        data[word_counter] = entry
        
text_file = 'data_000637.txt'

bin_file = 'data_000637.dat'

text_size = os.path.getsize(text_file)

bin_size = os.path.getsize(bin_file)

size_diff = text_size - bin_size

print(f"Size of text file: {text_size} bytes")
print(f"Size of binary file: {bin_size} bytes")
print(f"Difference in size on disk: {size_diff} bytes")

#The binary file takes up much less space than the txt file

df = pd.DataFrame(data).T
df


Size of text file: 33179236 bytes
Size of binary file: 80 bytes
Difference in size on disk: 33179156 bytes


Unnamed: 0,HEAD,FPGA,CHANNEL,ORBIT_CNT,BX_CNT,TDC_MEAS
1,1,0,123,3869200167,2374,26
2,1,0,124,3869200167,2374,27
3,1,0,63,3869200167,2553,28
4,1,0,64,3869200167,2558,19
5,1,0,64,3869200167,2760,25
6,1,0,63,3869200167,2762,4
7,1,0,61,3869200167,2772,14
8,1,0,139,3869200167,2776,0
9,1,0,62,3869200167,2774,21
10,1,0,60,3869200167,2788,7
