# Python Glob Tutorial

In [13]:
import pandas as pd
import glob
import os

In [7]:
# directory (relative to the notebook) that holds the input data
path = 'input'
# collect every entry directly inside that directory whose name ends in .csv
files = glob.glob(f'{path}/*.csv')
files

['input/KRO.csv',
 'input/MSFT.csv',
 'input/TSLA.csv',
 'input/GHC.csv',
 'input/AAPL.csv']

In [11]:
# accumulator for the per-file dataframes
li = []

# walk the globbed paths: load each csv, stash the result, report its shape
for csv_path in files:
    # parse the csv into its own dataframe
    frame = pd.read_csv(csv_path)
    # keep it around for the concat below
    li.append(frame)
    print(f'Successfully created dataframe for {csv_path} with shape {frame.shape}')

# stack all the per-file dataframes row-wise into a single dataframe
df = pd.concat(li, axis=0)
print(df.shape)
df.head()

Successfully created dataframe for input/KRO.csv with shape (1258, 6)
Successfully created dataframe for input/MSFT.csv with shape (1258, 6)
Successfully created dataframe for input/TSLA.csv with shape (1258, 6)
Successfully created dataframe for input/GHC.csv with shape (1258, 6)
Successfully created dataframe for input/AAPL.csv with shape (1258, 6)
(6290, 6)


Unnamed: 0,time,open,high,low,close,volume
0,2016-01-11 00:00:00-05:00,98.97,99.06,97.34,98.53,42721303
1,2016-01-12 00:00:00-05:00,100.55,100.69,98.8399,99.96,41812613
2,2016-01-13 00:00:00-05:00,100.32,101.19,97.3,97.41,55622868
3,2016-01-14 00:00:00-05:00,97.96,100.48,95.74,99.51,54777253
4,2016-01-15 00:00:00-05:00,96.2,97.71,95.36,97.09,62629182


In [21]:
# we have a problem ....
# how do we know which rows belong to which stock?
# we can add a column with this information

li = []

# loop through list of files, read each one into a dataframe,
# tag every row with the ticker taken from the filename, and append to list
for f in files:
    # get filename without the directory part, e.g. 'input/KRO.csv' -> 'KRO.csv'
    stock = os.path.basename(f)
    # drop the extension to get the ticker, e.g. 'KRO.csv' -> 'KRO'
    # (splitext only removes the final extension; a regex replace of '.csv' treats
    #  '.' as "any character" and can eat part of the name, e.g. 'mcsv.csv' -> '')
    ticker = os.path.splitext(stock)[0]
    # read in csv
    temp_df = pd.read_csv(f)
    # create new column holding the ticker on every row
    temp_df['ticker'] = ticker
    # append df to list
    li.append(temp_df)
    print(f'Successfully created dataframe for {stock} with shape {temp_df.shape}')

# concatenate our list of dataframes into one!
df = pd.concat(li, axis=0)
print(df.shape)
df.head()

Successfully created dataframe for KRO.csv with shape (1258, 7)
Successfully created dataframe for MSFT.csv with shape (1258, 7)
Successfully created dataframe for TSLA.csv with shape (1258, 7)
Successfully created dataframe for GHC.csv with shape (1258, 7)
Successfully created dataframe for AAPL.csv with shape (1258, 7)
(6290, 7)


Unnamed: 0,time,open,high,low,close,volume,ticker
0,2016-01-11 00:00:00-05:00,98.97,99.06,97.34,98.53,42721303,KRO
1,2016-01-12 00:00:00-05:00,100.55,100.69,98.8399,99.96,41812613,KRO
2,2016-01-13 00:00:00-05:00,100.32,101.19,97.3,97.41,55622868,KRO
3,2016-01-14 00:00:00-05:00,97.96,100.48,95.74,99.51,54777253,KRO
4,2016-01-15 00:00:00-05:00,96.2,97.71,95.36,97.09,62629182,KRO


In [37]:
### Searching for text in files

# set filepath to search
# '**' matches any depth of sub-directories once recursive=True is passed below
# NOTE: this is an absolute path on the author's machine - point it at your own folder
path = '/Users/tara/ml_guides/' + '**/*.ipynb'

# string to search for
search_term = 'kdeplot'

# empty list to hold files that contain matching string
files_to_check = []

# looping through all the filenames returned
# set recursive = True to look in sub-directories too
for filename in glob.iglob(path, recursive=True):
    try:
        # read the file as a string; .ipynb files are UTF-8 JSON, so be explicit
        # rather than relying on the platform's default encoding
        with open(filename, encoding='utf-8') as f:
            contents = f.read()
    except (OSError, UnicodeDecodeError) as err:
        # best effort: an unreadable file should not stop the search, but say so
        # instead of silently hiding it (and let Ctrl-C / real bugs propagate)
        print(f'Skipping {filename}: {err}')
        continue
    # if the search term is found append to the list of files
    if search_term in contents:
        files_to_check.append(filename)

files_to_check

['/Users/tara/ml_guides/superhero-exploratory-analysis.ipynb',
 '/Users/tara/ml_guides/glob/glob_tutorial.ipynb']