Reading Multiple File 

__Task:__ 

Sometimes your data is not present in just one file but split in multiple files. And you want to read them all and combine into one single dataframe. 



__Approach 1:__

In [1]:
import glob
csvfiles = []

# provide path in relation to working directory or fill folder path
csvfiles = glob.glob(r"../Datasets/AReM/lying\*.csv")
print(*csvfiles, sep="\n")


../Datasets/AReM/lying\dataset1.csv
../Datasets/AReM/lying\dataset10.csv
../Datasets/AReM/lying\dataset11.csv
../Datasets/AReM/lying\dataset12.csv
../Datasets/AReM/lying\dataset13.csv
../Datasets/AReM/lying\dataset14.csv
../Datasets/AReM/lying\dataset15.csv
../Datasets/AReM/lying\dataset2.csv
../Datasets/AReM/lying\dataset3.csv
../Datasets/AReM/lying\dataset4.csv
../Datasets/AReM/lying\dataset5.csv
../Datasets/AReM/lying\dataset6.csv
../Datasets/AReM/lying\dataset7.csv
../Datasets/AReM/lying\dataset8.csv
../Datasets/AReM/lying\dataset9.csv


In [2]:
import os
import pandas as pd

list_df = []


for csvfile in csvfiles:
    fpath = csvfile.replace("\\","/")
    print("Reading: ", fpath.ljust(40), "Exists: ", os.path.exists(fpath))
    df = pd.read_csv(fpath, skiprows=4, header=0)

    # Add filename column
    csv_name = csvfile.split('\\')[-1].split('.')[0]
    df['file'] = csv_name
    
    # Add df to a list
    list_df.append(df)
    
#concat all the df in the list
final_df = pd.concat(list_df)

Reading:  ../Datasets/AReM/lying/dataset1.csv      Exists:  True
Reading:  ../Datasets/AReM/lying/dataset10.csv     Exists:  True
Reading:  ../Datasets/AReM/lying/dataset11.csv     Exists:  True
Reading:  ../Datasets/AReM/lying/dataset12.csv     Exists:  True
Reading:  ../Datasets/AReM/lying/dataset13.csv     Exists:  True
Reading:  ../Datasets/AReM/lying/dataset14.csv     Exists:  True
Reading:  ../Datasets/AReM/lying/dataset15.csv     Exists:  True
Reading:  ../Datasets/AReM/lying/dataset2.csv      Exists:  True
Reading:  ../Datasets/AReM/lying/dataset3.csv      Exists:  True
Reading:  ../Datasets/AReM/lying/dataset4.csv      Exists:  True
Reading:  ../Datasets/AReM/lying/dataset5.csv      Exists:  True
Reading:  ../Datasets/AReM/lying/dataset6.csv      Exists:  True
Reading:  ../Datasets/AReM/lying/dataset7.csv      Exists:  True
Reading:  ../Datasets/AReM/lying/dataset8.csv      Exists:  True
Reading:  ../Datasets/AReM/lying/dataset9.csv      Exists:  True


In [3]:
final_df.head()

Unnamed: 0,# Columns: time,avg_rss12,var_rss12,avg_rss13,var_rss13,avg_rss23,var_rss23,file
0,0,29.0,0.0,9.0,0.71,8.5,0.5,dataset1
1,250,29.0,0.0,8.0,0.71,8.5,0.5,dataset1
2,500,29.0,0.0,8.0,0.71,8.0,1.0,dataset1
3,750,28.5,0.5,8.25,0.43,8.75,0.43,dataset1
4,1000,29.0,0.0,8.75,1.09,9.0,0.0,dataset1


In [4]:
final_df.shape

(7200, 8)

In [5]:
df.shape

(480, 8)

In [6]:
final_df.columns

Index(['# Columns: time', 'avg_rss12', 'var_rss12', 'avg_rss13', 'var_rss13',
       'avg_rss23', 'var_rss23', 'file'],
      dtype='object')

In [7]:
df.columns

Index(['# Columns: time', 'avg_rss12', 'var_rss12', 'avg_rss13', 'var_rss13',
       'avg_rss23', 'var_rss23', 'file'],
      dtype='object')

__Approach 2: Generator Approach__

Put it all in one function call.

In [8]:
df = pd.concat(pd.read_csv(fpath, skiprows=4, header=0) for fpath in csvfiles)
df

Unnamed: 0,# Columns: time,avg_rss12,var_rss12,avg_rss13,var_rss13,avg_rss23,var_rss23
0,0,29.00,0.00,9.00,0.71,8.50,0.50
1,250,29.00,0.00,8.00,0.71,8.50,0.50
2,500,29.00,0.00,8.00,0.71,8.00,1.00
3,750,28.50,0.50,8.25,0.43,8.75,0.43
4,1000,29.00,0.00,8.75,1.09,9.00,0.00
...,...,...,...,...,...,...,...
475,118750,41.50,0.50,10.67,0.47,14.00,0.82
476,119000,41.50,0.50,10.80,0.40,14.40,0.80
477,119250,41.75,0.43,10.00,0.00,13.67,0.94
478,119500,42.00,0.00,9.40,0.49,14.00,1.10


If you care about adding the filename as a new column, define it in a function function.

In [9]:
def read(fpath):
    df = pd.read_csv(fpath, skiprows=4, header=0)
    csv_name = csvfile.split('/')[-1].split('.')[0]
    df['file'] = csv_name
    return df

In [10]:
df = pd.concat(read(fpath) for fpath in csvfiles)
df

Unnamed: 0,# Columns: time,avg_rss12,var_rss12,avg_rss13,var_rss13,avg_rss23,var_rss23,file
0,0,29.00,0.00,9.00,0.71,8.50,0.50,lying\dataset9
1,250,29.00,0.00,8.00,0.71,8.50,0.50,lying\dataset9
2,500,29.00,0.00,8.00,0.71,8.00,1.00,lying\dataset9
3,750,28.50,0.50,8.25,0.43,8.75,0.43,lying\dataset9
4,1000,29.00,0.00,8.75,1.09,9.00,0.00,lying\dataset9
...,...,...,...,...,...,...,...,...
475,118750,41.50,0.50,10.67,0.47,14.00,0.82,lying\dataset9
476,119000,41.50,0.50,10.80,0.40,14.40,0.80,lying\dataset9
477,119250,41.75,0.43,10.00,0.00,13.67,0.94,lying\dataset9
478,119500,42.00,0.00,9.40,0.49,14.00,1.10,lying\dataset9
