In [683]:
from ds_utils.ds_preamble import *
from ds_utils.ds_plotting import *
from ds_utils.ds_helper import *
import datetime
import duckdb
from glob import glob
from pathlib import Path
from collections import Counter
import sys, subprocess
import time
import re

# class methods


In [292]:
os.chdir('/') # if we want to change to other directories, we still need to use `os.chdir`!
Path.cwd() # os.get_cwd
!pwd
Path.home()

os.chdir(Path.home()) # can directly give Path object

PosixPath('/')

/


PosixPath('/Users/whlin')

# create a path object directly from string

In [368]:
# NOTE: even in Windows, we can still use `/` as path separator. The forward slash operator is used independently of the actual path separator on the platform

# Note that in the preceding examples, the pathlib.Path is represented by either a WindowsPath or a PosixPath. The actual object representing the path depends on the underlying operating system. (That is, the WindowsPath example was run on Windows, while the PosixPath examples have been run on Mac or Linux.)

# M1
Path("~/Downloads/Hello/World")
# M2
Path.home() / 'Downloads' / 'Hello' / 'World' # `/` has been overloaded 
# M3
Path.home().joinpath('Downloads','Hello', 'World')
# M4: join two path as well
Path("~/Downloads", "hello", 'test.txt')

PosixPath('~/Downloads/Hello/World')

PosixPath('/Users/whlin/Downloads/Hello/World')

PosixPath('/Users/whlin/Downloads/Hello/World')

PosixPath('~/Downloads/hello/test.txt')

# open files using `open` method instead of `open` function

In [792]:
# Path class has an open method, which is the same as the built-in open function!

path = Path("testing.txt")
if not path.exists():
    path.write_text('hello\nworld')

with Path("testing.txt").open('r') as f: # same as with open("testing.txt", r) as f:
    for line in f:
        print(line)

hello

world


# access file content without `open`

In [218]:
# return as a string
Path("testing.txt").read_text()
Path("testing.txt").write_text("overwritting...") # this will erase the stuff in the original file...

'overwritting...'

15

# Path object attributes

In [288]:
# Note that .parent returns a new Path object, whereas the other properties return strings. This means for instance that .parent can be chained as in the last example or even combined with / to create completely new paths:
# path = Path("~/Downloads/Hello/world/my_test.txt")

for path in [Path("~/Downloads/Hello/world/my_test.txt"), Path('~/my/dir')]:
    print('====start====')
    print(f"{path = }")
    print(f"{path.name = }")
    print(f"{path.parent = }")
    print(f"{path.parent.parent = }")
    print(f"{path.stem = }")
    print(f"{path.suffix = }")
    print(f"{path.anchor = }")
    print(f"{path.root = }")
    print(f"{path.parent / ('world' + path.suffix)  = }")
    # change to another file 
    print('====end====')

====start====
path = PosixPath('~/Downloads/Hello/world/my_test.txt')
path.name = 'my_test.txt'
path.parent = PosixPath('~/Downloads/Hello/world')
path.parent.parent = PosixPath('~/Downloads/Hello')
path.stem = 'my_test'
path.suffix = '.txt'
path.anchor = ''
path.root = ''
path.parent / ('world' + path.suffix)  = PosixPath('~/Downloads/Hello/world/world.txt')
====end====
====start====
path = PosixPath('~/my/dir')
path.name = 'dir'
path.parent = PosixPath('~/my')
path.parent.parent = PosixPath('~')
path.stem = 'dir'
path.suffix = ''
path.anchor = ''
path.root = ''
path.parent / ('world' + path.suffix)  = PosixPath('~/my/world')
====end====


# Path object methods

In [302]:
# methods of Path object 
path = Path("~/Downloads/Hello/World/good.txt")
for p in [path, path.cwd()]:
    print('====start====')
    print(f'for path {p}:')
    print(f"{p.cwd() / 'textfiles' = }") # '/' has been overloaded
    print(f"{p.home() = }")
    print(f"{p.cwd().exists() = }")
    print(f"{p.exists() = }")
    print(f"{p.is_dir() = }")
    print(f"{p.is_file() = }")
    print(f"{p.is_absolute() = }")
    print(f"{path = }")
    print(f"{path.with_name('abc.png')  = }")
    print(f"{path.with_stem('abc') = }")
    print(f"{path.with_suffix('.png') = }")
    # p.chmod()
    print('====end====')

====start====
for path ~/Downloads/Hello/World/good.txt:
p.cwd() / 'textfiles' = PosixPath('/Users/whlin/textfiles')
p.home() = PosixPath('/Users/whlin')
p.cwd().exists() = True
p.exists() = False
p.is_dir() = False
p.is_file() = False
p.is_absolute() = False
path = PosixPath('~/Downloads/Hello/World/good.txt')
path.with_name('abc.png')  = PosixPath('~/Downloads/Hello/World/abc.png')
path.with_stem('abc') = PosixPath('~/Downloads/Hello/World/abc.txt')
path.with_suffix('.png') = PosixPath('~/Downloads/Hello/World/good.png')
====end====
====start====
for path /Users/whlin:
p.cwd() / 'textfiles' = PosixPath('/Users/whlin/textfiles')
p.home() = PosixPath('/Users/whlin')
p.cwd().exists() = True
p.exists() = True
p.is_dir() = True
p.is_file() = False
p.is_absolute() = True
path = PosixPath('~/Downloads/Hello/World/good.txt')
path.with_name('abc.png')  = PosixPath('~/Downloads/Hello/World/abc.png')
path.with_stem('abc') = PosixPath('~/Downloads/Hello/World/abc.txt')
path.with_suffix('.png') =

## rename file, rename directory, create file, create directory

In [276]:
# rename files
Path("testing.txt").write_text("hello world!") # make a new file first
Path("testing.txt").with_stem("test") # get a Path object withe a name name. Will not do rename
Path("testing.txt").replace("test.txt") # get a Path object withe a name name. Will do rename. NOTE: if there exists a file with the same replaced name, then the old file will be removed as well! Be extra careful about it!

# create directory
Path('my/path/to/dir').mkdir(mode=511, parents=True, exist_ok=True) # create a new directory 

# rename directory
Path('./my/path/to/dir').rename("./my/path/to/NEW_dir") # create a new directory 

# this is wrong..
# Path('./my/path/to/dir').rename("NEW_dir") # create a new directory 

12

PosixPath('test.txt')

PosixPath('test.txt')

PosixPath('my/path/to/NEW_dir')

# glob and rglob (recursive glob)

In [278]:
sorted(Path().rglob('*.py')) # search current directory recursively for *.py files
sorted(Path().glob('*.py')) # search current directory for *.py files
for x in Path('~/Downloads').expanduser().glob('*.pdf'): # very often, we can't use `~` when we iterate a directory...
    print(x)

[PosixPath('Corey_rename_song_list/rename.py'),
 PosixPath('SEHH2042_AS_Cleaning/SEHH2042_AS_cleaning.py'),
 PosixPath('SEHH2042_GP_add group id to cpp/SEHH2042_GP_add group id to cpp.py'),
 PosixPath('SEHH2042_GP_insert students info to worksheet/SEHH2042_GP_addInfo.py'),
 PosixPath('SEHH2042_addGPID/SEHH2042_addGPID.py'),
 PosixPath('changeName.py'),
 PosixPath('change_file_names.py'),
 PosixPath('change_name_to_ID_only.py'),
 PosixPath('open a doc/open_doc.py'),
 PosixPath('reverse_lines/reverse_line.py')]

[PosixPath('changeName.py'),
 PosixPath('change_file_names.py'),
 PosixPath('change_name_to_ID_only.py')]

/Users/whlin/Downloads/ibank_interviews.pdf
/Users/whlin/Downloads/Professional Scrum Master I.pdf
/Users/whlin/Downloads/Pathlib-Cheatsheet.pdf
/Users/whlin/Downloads/Job_Application_Form.pdf
/Users/whlin/Downloads/IMM5812_2-VRV0LE2.pdf
/Users/whlin/Downloads/Leon Wei - Cracking the SQL Interview for Data Scientists_ Nervous about your SQL Interview_ Anxiety ends here. Learn, refresh and master SQL Skills in a Week. (2021).pdf
/Users/whlin/Downloads/Jeroen Janssens - Data Science at the Command Line_ Obtain, Scrub, Explore, and Model Data with Unix Power Tools-O'Reilly Media (2021).pdf
/Users/whlin/Downloads/IMM5825_2-VRV0LC9.pdf
/Users/whlin/Downloads/IMM5812_2-VRV0LE2 (1).pdf
/Users/whlin/Downloads/IMM5825_2-VRV0LC9 (1).pdf
/Users/whlin/Downloads/Malik_C++ Programming_ From Problem Analysis to Program Design[2437]2.pdf
/Users/whlin/Downloads/1.7 SARP Meeting Schedule 2122S2 - By Division.pdf
/Users/whlin/Downloads/e-service2.pdf
/Users/whlin/Downloads/Syllabus.pdf


# file iteration
- we use `Path.iterdir` instead of `os.listdir` !

In [313]:
for f in Path.cwd().iterdir():
    print(f.name, end=' ')

.Rhistory .eclipse .config Music .julia .condarc out.txt play_.txt .DS_Store .CFUserTextEncoding my_out.txt bin less .bashrc .local .psql_history Pictures playing.txt .zprofile DM_notes.pdf .zsh_history .ipython Desktop Library .matplotlib .lesshst Parallels miniforge3 seaborn-data .emacs.d scikit_learn_data .cups .bash_sessions Public play.txt .vscode-R .RData OneDrive - HKUST Connect Applications (Parallels) Movies d~ .emacs Applications .emacs~ .Rapp.history .Trash test.txt .ipynb_checkpoints .tldrc .jupyter .keras .vscode-insiders Documents sys_info_page.html error.txt .vscode .bash_profile Downloads .python_history .gitconfig d .bash_history .viminfo .zsh_sessions My Drive .conda ls-output.txt 

In [327]:
def count_suffix(path):
    # count the extension of the files in a directory
    return Counter(p.suffix for p in Path(path).expanduser().iterdir())    

# path = '~/Downloads'
# path = Path.cwd()
path = Path.home() / 'Downloads'
count_suffix(path)

Counter({'': 13,
         '.pdf': 14,
         '.torrent': 4,
         '.docx': 1,
         '.sql': 3,
         '.xls': 1,
         '.rar': 2,
         '.xlsx': 6,
         '.md': 1,
         '.zip': 4,
         '.ini': 1,
         '.epub': 1})

In [542]:
def create_dir(path, n_dir=3, n_doc=4):
    # in the given directory `path`, create dir_{1,2,..n_dir}, and inside each dir_*, create doc_{1,2,..,n_doc}.txt
    for i in range(1, n_dir+1):
        # cur_path = Path(path).expanduser().joinpath(f'dir_{i}')
        cur_path = Path(path, f'dir_{i}').expanduser()
        print(cur_path)

        cur_path.mkdir(parents=True, exist_ok=True)
        
        for j in range(1, n_doc+1):
            Path(cur_path, f'doc_{j}.txt').write_text(f"doc_{j}")

def tree(path):
    path = Path(path).expanduser()
    print(f'- {path}')
    for p in sorted(path.rglob('*')):
        depth = len(p.relative_to(path).parts) 
        spacer = '    '*depth
        print(f'{spacer}- {p.name}')
        # print(p)

def unique_path(directory, name_pattern):
    counter = 0
    while True:
        counter += 1
        path = Path(directory, name_pattern.format(counter))
        if not path.exists():
            return path


# path = Path.home()/'playing'
# path = '~/hello/world'
path = '~/Desktop/hello/world'
create_dir(path)

path = "~/Desktop/hello"
tree(path)

path = unique_path('~/Desktop/hello', 'test{:03d}.txt')
path


/Users/whlin/Desktop/hello/world/dir_1
/Users/whlin/Desktop/hello/world/dir_2
/Users/whlin/Desktop/hello/world/dir_3
- /Users/whlin/Desktop/hello
    - world
        - dir_1
            - doc_1.txt
            - doc_2.txt
            - doc_3.txt
            - doc_4.txt
        - dir_2
            - doc_1.txt
            - doc_2.txt
            - doc_3.txt
            - doc_4.txt
        - dir_3
            - doc_1.txt
            - doc_2.txt
            - doc_3.txt
            - doc_4.txt


PosixPath('~/Desktop/hello/test001.txt')

In [548]:
def walk(path):
    # print all files in a directory (including subdirectories inside)
    # M1: recursion
    for file in sorted(Path(path).expanduser().iterdir()):
        if file.is_file():
            print(file)
        else:
            walk(file)
    # M2: rglob
    # for file in sorted(Path(path).expanduser().rglob('*')):
    # for file in Path(path).expanduser().rglob('*'):
    #     if file.is_file():
    #         print(file)

    
walk('~/Desktop/hello/world')

/Users/whlin/Desktop/hello/world/.DS_Store
/Users/whlin/Desktop/hello/world/dir_1/doc_1.txt
/Users/whlin/Desktop/hello/world/dir_1/doc_2.txt
/Users/whlin/Desktop/hello/world/dir_1/doc_3.txt
/Users/whlin/Desktop/hello/world/dir_1/doc_4.txt
/Users/whlin/Desktop/hello/world/dir_2/doc_1.txt
/Users/whlin/Desktop/hello/world/dir_2/doc_2.txt
/Users/whlin/Desktop/hello/world/dir_2/doc_3.txt
/Users/whlin/Desktop/hello/world/dir_2/doc_4.txt
/Users/whlin/Desktop/hello/world/dir_3/doc_1.txt
/Users/whlin/Desktop/hello/world/dir_3/doc_2.txt
/Users/whlin/Desktop/hello/world/dir_3/doc_3.txt
/Users/whlin/Desktop/hello/world/dir_3/doc_4.txt


# Path object attributes

# function to add prefix/suffix to the files of a directory

In [472]:
def add_prefix_suffix(path, ext='', prefix='', suffix=''):
    # ext: filter by extension 
    # use case: add timestamp to each file
    pattern = f'*.{ext}' if ext else '*'

    for file in sorted(Path(path).expanduser().glob(pattern)):
        # print(file)
        new_name = '_'.join(x for x in [prefix,file.stem,suffix] if x)
        file.rename(file.with_stem(new_name))

# old code 
# def change_file_names(path, ext='png', prefix=None, suffix=None):
#     for file in os.listdir(path):
#         # print(type(file))
#         if file.endswith(ext):
#             new_name = file
#             if prefix:
#                 # new_name = prefix + '_' + new_name
#                 new_name = f'{prefix}_{new_name}'
#             if suffix:
#                 # new_name = new_name + '_' + suffix
#                 new_name = f'{new_name}_{suffix}'
#             print(new_name)
#             os.rename(os.path.join(path, file), os.path.join(path, new_name))


path = '~/Desktop/hello/world/dir_1'
add_prefix_suffix(path, prefix='pref', suffix='3')


In [533]:
def print_file(path, pattern='*' ,start=None, end=None, strip=False, output_path=None):
    res = []
    for file in sorted(Path(path).expanduser().glob(pattern)):
        if strip: 
            res.append(str(file)[start:end].strip()) 
        else: 
            res.append(str(file)[start:end]) 
    if output_path:
        Path(output_path).expanduser().write_text('\n'.join(res))
    print(*res)
    return res

path = '~/Desktop/hello/world/dir_1'
# res = print_file(path, start=-15, end=None)
res = print_file(path, strip=True)

/Users/whlin/Desktop/hello/world/dir_1/doc_1.txt /Users/whlin/Desktop/hello/world/dir_1/doc_2.txt /Users/whlin/Desktop/hello/world/dir_1/doc_3.txt /Users/whlin/Desktop/hello/world/dir_1/doc_4.txt /Users/whlin/Desktop/hello/world/dir_1/my_out.txt /Users/whlin/Desktop/hello/world/dir_1/pref_pref_pref_pref_pref_pref_doc_1_3_3_3.txt /Users/whlin/Desktop/hello/world/dir_1/pref_pref_pref_pref_pref_pref_doc_2_3_3_3.txt /Users/whlin/Desktop/hello/world/dir_1/pref_pref_pref_pref_pref_pref_doc_3_3_3_3.txt /Users/whlin/Desktop/hello/world/dir_1/pref_pref_pref_pref_pref_pref_doc_4_3_3_3.txt


In [603]:
def open_file(filename):
    if sys.platform == "win32":
        os.startfile(filename)
    else:
        opener = "open" if sys.platform == "darwin" else "xdg-open"
        subprocess.call([opener, filename])

def open_doc(path, pattern='*', start=None, end=None, recursive=True):
    res = []
    for file in eval(f"sorted(Path(path).expanduser().{'r' if recursive else ''}glob(pattern))"):
        if file.is_file():
            res.append(file)
    print(f"number of documents in the directory = {len(res)}")

    for i in range(start-1, end):
        open_file(str(res[i]))
        time.sleep(0.8)

# path = '~/Desktop/hello/world/dir_1'
path = '/Users/whlin/Library/CloudStorage/OneDrive-HKUSTConnect/Academic/Notes of HKCC/HKCC Teaching/SEHH2239/exam things/201'

start=int(input("enter start index(start from 1): "))
end=int(input("enter end index: "))

open_doc(path, pattern='*.pdf', start=start, end=end, recursive=True)



number of documents in the directory = 100


In [625]:
def reverse_lines(file, output_name):
    file = Path(file).expanduser()
    text = file.read_text()
    # print(text)
    res = '\n'.join(text.strip().split('\n')[::-1])
    # print(res)
    file.with_name(output_name).write_text(res)

path = '/Users/whlin/Library/CloudStorage/OneDrive-HKUSTConnect/Documents/python/my_notes_py/ad-hoc code/file_management/reverse_lines/chinese_text.txt'
reverse_lines(path, output_name='testttt.txt')

摩星嶺白屋(芝加哥大學)
摩星嶺徑
碧珊徑
龍虎山健身徑
夏力道 
山頂廣場
白加道
馬己仙峽
灣仔峽
布力徑
黃泥涌峽
大潭水塘道
大潭上水塘
港島林道美景路段
石澳道
港島林道哥連臣山段
馬塘坳
砵甸乍山郊遊徑
龍躍徑
小西灣海濱公園
小西灣海濱公園
龍躍徑
砵甸乍山郊遊徑
馬塘坳
港島林道哥連臣山段
石澳道
港島林道美景路段
大潭上水塘
大潭水塘道
黃泥涌峽
布力徑
灣仔峽
馬己仙峽
白加道
山頂廣場
夏力道 
龍虎山健身徑
碧珊徑
摩星嶺徑
摩星嶺白屋(芝加哥大學)


In [711]:
def add_id(path, works, pattern):
    """ Append each document with an given gid.
        EX: 1234556A_ASM => 1_1234556A_ASM if the document has gid 1 
        When grading group project, we want to append the group id of each student to their submitted p2p files
        - files: id: gip mapping
            19186232A	1
            19011630A	1
            20190501A	1
            20018393A	1
            20208582A	1
            20010776A	2
            20093215A	2
            20009962A	2
            20165160A	2
            20205402A	2
            20013324A	2
            20077332A	3
            20080999A	3
        - directory containing students' work:
            19186232A_my_work.txt
            19011630A_abc.txt
    """
    path = Path(path).expanduser()

    r = re.compile('([1-2][0-9]{7}A)')

    # M1: use dict to get sid:gp pairs
    # d = {}
    #     for line in f:
    #         sid, gp = line.strip().split('\t')
    #         d[sid] = gp

    # M2: use Series with sid as index 
    d = pd.read_csv(path, sep='\t', header=None, names=['sid','gp_id'], index_col='sid').iloc[:,0]

    for f in sorted(Path(works).expanduser().glob(pattern)):
        # extracted_id = f.stem[:9].upper()
        extracted_id = r.findall(f.stem)[0].upper() if r.findall(f.stem) else ''
        # print(extracted_id)
        if extracted_id in d.index:
            # M1: just add extracted d[sid] (the gp id) as prefix
            # print(f.with_stem(f"{d[extracted_id]}_{f.stem}"))
            # f.rename(f.with_stem(f"{d[extracted_id]}_{f.stem}"))

            # M2: just add extracted d[sid] (the gp id) as prefix, and rename the original file to have the sid only
            print(f.with_stem(f"{d[extracted_id]}_{extracted_id}"))
            # f.rename(f.with_stem(f"{d[extracted_id]}_{f.stem}"))

path = '/Users/whlin/Library/CloudStorage/OneDrive-HKUSTConnect/Documents/python/my_notes_py/ad-hoc code/file_management/SEHH2042_addGPID/SEHH2042_stuID+gp.txt'

works = '/Users/whlin/Library/CloudStorage/OneDrive-HKUSTConnect/Documents/python/my_notes_py/ad-hoc code/file_management/SEHH2042_work'
add_id(path, works=works, pattern='*.txt')

/Users/whlin/Library/CloudStorage/OneDrive-HKUSTConnect/Documents/python/my_notes_py/ad-hoc code/file_management/SEHH2042_work/1_19011630A.txt
/Users/whlin/Library/CloudStorage/OneDrive-HKUSTConnect/Documents/python/my_notes_py/ad-hoc code/file_management/SEHH2042_work/1_19186232A.txt


In [779]:
def rm_tree(path):
    # recursively remove the directory `path`
    # IMPORTANT: very dangerous!!!
    path = Path(path).expanduser()
    for f in path.rglob('*'):
        if f.is_file():
            f.unlink()
        elif f.is_dir():
            rm_tree(f)
    path.rmdir()

def rm_files(path):
    path = Path(path).expanduser()
    for f in path.rglob('*'):
        if f.is_file():
            f.unlink()
import shutil
path = '~/Desktop/song_list_exercise'
# rm_tree(path)
# rm_files(path)
shutil.rmtree(Path(path).expanduser())

In [782]:
def create_song_list(path):
    # create song list in format
    # gp1_1 - gp2_1 - #1.txt
    path = Path(path).expanduser()
    path.mkdir(parents=True, exist_ok=True)
    for i in range(1, 11):
        path.joinpath(f"gp1_{i} - gp2_{i} - #{i}.txt").write_text('')

def rename_song_list(path, pattern='*'):
    # rename song file to this format
    # 001-gp1_1-gp2_1.txt
    path = Path(path).expanduser()
    for f in path.glob(pattern):
        gp1, gp2, num = [x.strip() for x in f.stem.split('-')]
        num = num[1:].zfill(3)
        new_stem = f"{num}-{gp1}-{gp2}"
        # print(new_stem)
        # print(f.with_stem(new_stem))
        f.rename(f.with_stem(new_stem))

path = '~/Desktop/song_list_exercise'
create_song_list(path)
# rename_song_list(path, pattern='[!.]*')
