# Operacje I/O

## Managery kontekstu

## praca z plikami

### Przykład optymalizacji - zwrócenie długości pliku.

In [9]:
filename = "data/PESEL_NAZWISKA.csv"  # 298127 linii

def simplecount(filename):
    lines=0
    with open(filename) as f:
        for line in f:
            lines += 1
    return lines

simplecount(filename)

def simplecount2(filename):
    with open(filename) as f:
        lines = sum(1 for line in f)
    return lines

assert simplecount2(filename) == 298127

def enumerate_count(fname):
    with open(fname) as f:
        for i, l in enumerate(f):
            pass
    return i + 1

assert enumerate_count(filename) == 298127

def mapcount(filename):
    lines = 0
    with open(filename) as f:
        buf = mmap.mmap(f.fileno(), 0, prot=mmap.PROT_READ)
        readline = buf.readline
        while readline():
            lines += 1
    return lines

assert mapcount(filename) == 298127

def bufcount(filename):
    lines=0
    buf_size = 1024 * 1024
  
    with open(filename) as f:
        lines = 0
        read_f = f.read # loop optimization
        buf = read_f(buf_size)
        while buf:
            lines += buf.count('\n')
            buf = read_f(buf_size)
    return lines

assert bufcount(filename) == 298127

import numpy as np

def numpy_count(filename):
    my_data = np.genfromtxt(filename, delimiter=',')
    return my_data.shape[0]

assert bufcount(filename) == 298127


import pandas as pd

def pd_read(filename):
    df = pd.read_csv(filename)
    return df.shape[0] + 1

assert pd_read(filename) == 298127

import time
import mmap
import random
from collections import defaultdict


methods = [
    simplecount,
    simplecount2,
    enumerate_count,
    mapcount,
    bufcount,
    numpy_count,
    pd_read, 
    

]
counts = defaultdict(list)

for i in range(5):
    for func in methods:
        start_time = time.time()
        assert func("data/PESEL_NAZWISKA.csv") == 298127
        counts[func].append(time.time() - start_time)

for key, vals in counts.items():
    print(key.__name__, ":", sum(vals) / float(len(vals)))

NameError: name 'mmap' is not defined

### Praca z dużymi plikam

## Bazy danych

### Postgresql

In [7]:
# pip install psycopg2
# !which pip

Dane do połączenia z bazą danych

In [1]:
connection_data = dict(
    user="doadmin",
    password="mo8ishar2150lq59",
    host="db-kpmg-tests-do-user-2498346-0.db.ondigitalocean.com",
    port=25060,
    database="defaultdb",
    sslmode="require",
)

In [2]:
import psycopg2   # biblioteka umożliwiająca połączenie
try:
    connection = psycopg2.connect(**connection_data)  # nawiązywanie połączenia
    cursor = connection.cursor()           # tworzenie kursora

    # Print PostgreSQL Connection properties
    print(connection.get_dsn_parameters(), "\n")  # wypisanie parametrów

    # Print PostgreSQL version
    cursor.execute("SELECT version();")           # wykonanie skryptu SQL 
    record = cursor.fetchone()                    # pobranie wyniku
    print("You are connected to - ", record, "\n")

except (Exception, psycopg2.Error) as error:
    print("Error while connecting to PostgreSQL", error)
finally:
    # Zamknięcie kursora i połączenia.
    if(connection):
        cursor.close()
        connection.close()
        print("PostgreSQL connection is closed")

{'user': 'doadmin', 'dbname': 'defaultdb', 'host': 'db-kpmg-tests-do-user-2498346-0.db.ondigitalocean.com', 'port': '25060', 'tty': '', 'options': '', 'sslmode': 'require', 'sslcompression': '0', 'krbsrvname': 'postgres', 'target_session_attrs': 'any'} 

You are connected to -  ('PostgreSQL 11.6 on x86_64-pc-linux-gnu, compiled by gcc, a 863367dcd p 310a24bd77, 64-bit',) 

PostgreSQL connection is closed


In [77]:
with psycopg2.connect(**connection_data) as conn:
    with conn.cursor() as cursor:
        cursor.execute("SELECT version();")
        record = cursor.fetchone()
        print("You are connected to - ", record, "\n")

You are connected to -  ('PostgreSQL 11.6 on x86_64-pc-linux-gnu, compiled by gcc, a 863367dcd p 310a24bd77, 64-bit',) 



Utworzenie tabeli

In [78]:
create_table_sql = """
CREATE TABLE surnames (
   id SERIAL PRIMARY KEY,
   surname VARCHAR (250) UNIQUE NOT NULL,
   count INTEGER NOT NULL
);
"""


with psycopg2.connect(**connection_data) as conn:
    with conn.cursor() as cursor:
        cursor.execute(create_table_sql)
        conn.commit()

In [81]:
sql = "SELECT * FROM surnames;"

with psycopg2.connect(**connection_data) as conn:
    with conn.cursor() as cursor:
        cursor.execute(sql)
        record = cursor.fetchall()
print(record)

[]


Dodajmy do bazy trochę danych. Wrzucimy tam informacje o częstotliwości występowania nazwisk w systemie PESEL.

Na początku trzeba zbadać same dane. By przeczytać tylko kilka pierwszych linii możemy wykorzystać funkcje next(), albo np islice z itertools.


In [82]:
import csv

ile linii ma plik?


In [102]:
# \xa0 - no break space
with open("data/PESEL_NAZWISKA.csv") as f:
    head = [next(f) for n in range(10)]
    print(head)

['Nazwisko,Liczba wystąpień\n', 'NOWAK,139\xa0325\n', 'KOWALSKA,89\xa0885\n', 'WIŚNIEWSKA,70\xa0701\n', 'WÓJCIK,63\xa0894\n', 'KOWALCZYK,62\xa0407\n', 'KAMIŃSKA,60\xa0965\n', 'LEWANDOWSKA,60\xa0468\n', 'DĄBROWSKA,59\xa0520\n', 'ZIELIŃSKA,58\xa0929\n']


In [92]:
from itertools import islice

with open("data/PESEL_NAZWISKA.csv") as f:
    head = list(islice(f, 10))
    print(head)

['Nazwisko,Liczba wystąpień\n', 'NOWAK,139\xa0325\n', 'KOWALSKA,89\xa0885\n', 'WIŚNIEWSKA,70\xa0701\n', 'WÓJCIK,63\xa0894\n', 'KOWALCZYK,62\xa0407\n', 'KAMIŃSKA,60\xa0965\n', 'LEWANDOWSKA,60\xa0468\n', 'DĄBROWSKA,59\xa0520\n', 'ZIELIŃSKA,58\xa0929\n']


In [94]:
import mmap
mmap.mmap?

In [70]:
with open("data/PESEL_NAZWISKA.csv") as f:
    data = csv.reader(f, delimiter=',')
    columns = next(data)
    for d in data:
        print(d[0], int(d[1].replace(u"\xa0","")))
        break

NOWAK 139325


### SQLite

### zadanie

Odczytaj dane - np z pliku, API, lub innej bazy danych i zapisz je w bazie dane.sqlite

