# Notebook for loading excel files

### Function that converts .xlsx files to .xls 

In [None]:
import pyexcel as p

def xlsx_to_xls(filename):
    """Gets the filename of the excel file you want to chenge the format from xlsx to xls.

    Parameters
    ----------
    filename : str
        The complete path of the file, the format of which you want to modify.
        The file must be in the format xlsx.

    Returns
    -------
    None
        Creates a new file and saves it in the same directory with the orginal file,
        but with format xls.

    """
    if not filename.endswith('.xlsx'):
        raise Exception('Invalid File Format')

    else:
        dest_filename = filename.replace('.xlsx', '.xls')
        print(dest_filename)
        p.save_as(file_name = filename, dest_file_name = dest_filename)
    



xlsx_to_xls('/home/billys/Documents/thesis-bill/DATA/ENERSHARE_DATA _NTUA/Flows 2017 - Hourly- Excel/Hourly-Flows-02_2017-1.xlsx')

### Function that reads a single excel file

The function below reads an excel file by its name, modifies its format and
merges all the sheets in a single dataframe

In [20]:
import os
import pandas as pd
import datetime


def read_excel_file(filename):
    """Gets a filename of an excel file that has an xls format and returns
        a merged dataframe with all sheets together

    Parameters
    ----------
    filename : str
        The full path of the excel file

    Returns
    -------
    DataFrame
        A dataframe that has all the sheets of the excel file in the format needed
    """
    dfs_dict = pd.read_excel(filename, sheet_name=None, engine='xlrd')
    print("The number of sheets in this excel file is", len(dfs_dict))
    print(dfs_dict.keys())
    prev_df = pd.DataFrame()
    result = pd.DataFrame()
    for df in dfs_dict.values():
        df.columns = df.iloc[1]
        
        for i in range(2, len(df['Time'])):
            if type(df['Time'][i]) is datetime.datetime:
                df['Time'][i] = datetime.datetime.combine(df['Time'][0].date(), df['Time'][i].time())
        df = df.iloc[2:26, 1:]
        
        result = pd.concat([prev_df, df], ignore_index=True)
        prev_df = result

    return result
    
read_excel_file('/home/billys/Documents/thesis-bill/DATA/ENERSHARE_DATA _NTUA/Flows 2015 - Hourly - Excel/Hourly-Flows-01_2015.xls')
    

The number of sheets in this excel file is 31
dict_keys(['01.01.2015', '02.01.2015', '03.01.2015', '04.01.2015', '05.01.2015', '06.01.2015', '07.01.2015', '08.01.2015', '09.01.2015', '10.01.2015', '11.01.2015', '12.01.2015', '13.01.2015', '14.01.2015', '15.01.2015', '16.01.2015', '17.01.2015', '18.01.2015', '19.01.2015', '20.01.2015', '21.01.2015', '22.01.2015', '23.01.2015', '24.01.2015', '25.01.2015', '26.01.2015', '27.01.2015', '28.01.2015', '29.01.2015', '30.01.2015', '31.01.2015'])


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


1,Time,AGIA TRIADA,SIDIROKASTRO,KIPI,NaN,AGIOI THEODOROI,ALOYMINION,ALOYMINION II,ALOYMINION III,ATHENS,...,SPATA,MOTOR OIL,MOTOR OIL II,XANTHI,OINOFYTA,PLATY,SALFA ANTHOUSSA,SALFA ANO LIOSSIA,SERRES,TRIKALA
0,2015-01-01 09:00:00,1.86764,2364.74,782.328,,0.136442,435.095,0,90.9743,1022.78,...,11.1896,0,0,3.11111,32.4378,22.0459,5.06585,6.07902,43.2,36.4957
1,2015-01-01 10:00:00,0,2397.38,512.001,,0.137161,433.904,0,95.3129,964.889,...,11.1699,0,0,0.497778,33.9043,24.1939,3.63352,4.36022,42.8444,41.0441
2,2015-01-01 11:00:00,219.046,2438.46,513.324,,0.0921605,433.683,0,95.1789,909.655,...,10.9345,9.3649,0,2.18667,33.7282,23.5737,0.0480136,0.0576163,43.68,38.4149
3,2015-01-01 12:00:00,738.346,2438.65,513.631,,0.0933327,434.947,0,90.6915,946.129,...,11.1816,25.0155,0,0.604444,33.2008,22.2877,1.70794,2.04953,44.3022,35.5121
4,2015-01-01 13:00:00,715.792,2462.36,512.974,,0.146365,436.569,0,90.0832,1090.82,...,9.84795,2.58689,0,2.61333,33.894,20.1344,2.7108,3.25296,44.6578,34.5842
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2015-01-31 04:00:00,1768.63,1629.47,0,,0.113794,422.186,0,82.3101,241.832,...,7.0627,0,0,2.64889,109.256,12.888,1.26389,1.51667,34.1333,9.19587
740,2015-01-31 05:00:00,1775.48,1639.24,0,,1.10098,421.428,0,83.9954,240.088,...,6.73343,0,0,3.32444,93.9685,13.1835,0.405599,0.486719,31.6444,9.10032
741,2015-01-31 06:00:00,1768.13,1659.02,0,,2.59893,421.301,0,82.2137,256.186,...,6.90467,0,0,2.88,88.4709,12.9348,6.76081,8.11297,29.1556,10.2563
742,2015-01-31 07:00:00,1782.08,1671.58,0,,0.992284,421.592,0,83.1517,369.37,...,6.36038,23.2564,0,2.79111,92.2171,12.9419,4.81906,5.78287,26.9511,11.6964


### Function that reads all excel files in multiple folders and merges them into a single dataframe

In [1]:
import os
import pandas as pd
import datetime


def read_files(folder_path, prefix):
    """Gets as input a directory that contains all excel files
    along with a prefix that every single file folder starts with and
    returns a dataframe that contains all data of all files

    Parameters
    ----------
    folder_path : str
        The full path to the directory that contains other directories
        that contain the files we want to load
    
    prefix : str
        A prefix that exists in the beginning of every folder name

    Returns
    -------
    DataFrame
        A dataframe that contains all data of all excel files merged into a single one dataframe
    """
    
    prev_df = pd.DataFrame()
    result = pd.DataFrame()
    for foldername in os.listdir(folder_path):
        if foldername.startswith(prefix):
            file_path = folder_path
            file_path = os.path.join(folder_path, foldername)
            # Open all excel files from those folders
            print(os.listdir(file_path))
            for filename in os.listdir(file_path):
                temp_filename = os.path.join(file_path, filename)
                if temp_filename.endswith('.xls'):
                    print(temp_filename)
                    dfs_dict = pd.read_excel(temp_filename, engine='xlrd', sheet_name=None)
                    for df in dfs_dict.values():
                        df.columns = df.iloc[1]

                        for i in range(2, len(df['Time'])):
                            if type(df['Time'][i]) is datetime.datetime:
                                df['Time'][i] = datetime.datetime.combine(df['Time'][0].date(), df['Time'][i].time())
                        df = df.iloc[2:26, 1:]

                        result = pd.concat([prev_df, df], ignore_index=True)
                        prev_df = result
            print(result)

    return result
    
read_files('../DATA/ENERSHARE_DATA _NTUA', 'Flows')
    

['Hourly-Flows-05_2016.xls', 'Hourly-Flows-09_2016-1.xls', 'Hourly-Flows-04_2016.xls', 'Hourly-Flows-01_2016.xls', 'Hourly-Flows-12_2016.xls', 'Hourly-Flows-02_2016-1.xls', 'Hourly-Flows-06_2016-2.xls', 'Hourly-Flows-03_2016.xls', 'Hourly-Flows-08_2016.xls', 'Hourly-Flows-10_2016.xls', 'Hourly-Flows-07_2016.xls']
../DATA/ENERSHARE_DATA _NTUA/Flows 2016 - Hourly - Excel/Hourly-Flows-05_2016.xls


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


../DATA/ENERSHARE_DATA _NTUA/Flows 2016 - Hourly - Excel/Hourly-Flows-09_2016-1.xls
../DATA/ENERSHARE_DATA _NTUA/Flows 2016 - Hourly - Excel/Hourly-Flows-04_2016.xls
../DATA/ENERSHARE_DATA _NTUA/Flows 2016 - Hourly - Excel/Hourly-Flows-01_2016.xls
../DATA/ENERSHARE_DATA _NTUA/Flows 2016 - Hourly - Excel/Hourly-Flows-12_2016.xls
../DATA/ENERSHARE_DATA _NTUA/Flows 2016 - Hourly - Excel/Hourly-Flows-02_2016-1.xls
../DATA/ENERSHARE_DATA _NTUA/Flows 2016 - Hourly - Excel/Hourly-Flows-06_2016-2.xls
../DATA/ENERSHARE_DATA _NTUA/Flows 2016 - Hourly - Excel/Hourly-Flows-03_2016.xls
../DATA/ENERSHARE_DATA _NTUA/Flows 2016 - Hourly - Excel/Hourly-Flows-08_2016.xls
../DATA/ENERSHARE_DATA _NTUA/Flows 2016 - Hourly - Excel/Hourly-Flows-10_2016.xls
../DATA/ENERSHARE_DATA _NTUA/Flows 2016 - Hourly - Excel/Hourly-Flows-07_2016.xls
                     Time AGIA TRIADA SIDIROKASTRO     KIPI NaN  \
0     2016-05-01 09:00:00           0       1806.4  1061.82 NaN   
1     2016-05-01 10:00:00           0   

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


../DATA/ENERSHARE_DATA _NTUA/Flows 2017 - Hourly- Excel/Hourly-Flows-06_2017-2.xls
../DATA/ENERSHARE_DATA _NTUA/Flows 2017 - Hourly- Excel/Hourly-Flows-01_2017-2.xls
../DATA/ENERSHARE_DATA _NTUA/Flows 2017 - Hourly- Excel/Hourly-Flows-08_2017.xls
../DATA/ENERSHARE_DATA _NTUA/Flows 2017 - Hourly- Excel/Hourly-Flows-03_2017.xls
../DATA/ENERSHARE_DATA _NTUA/Flows 2017 - Hourly- Excel/Hourly-Flows-09_2017.xls
../DATA/ENERSHARE_DATA _NTUA/Flows 2017 - Hourly- Excel/Hourly-Flows-11_2017.xls
../DATA/ENERSHARE_DATA _NTUA/Flows 2017 - Hourly- Excel/Hourly-Flows-10_2017.xls
../DATA/ENERSHARE_DATA _NTUA/Flows 2017 - Hourly- Excel/Hourly-Flows-04_2017.xls
../DATA/ENERSHARE_DATA _NTUA/Flows 2017 - Hourly- Excel/Hourly-Flows-05_2017-2.xls
../DATA/ENERSHARE_DATA _NTUA/Flows 2017 - Hourly- Excel/Hourly-Flows-07_2017-1.xls
../DATA/ENERSHARE_DATA _NTUA/Flows 2017 - Hourly- Excel/Hourly-Flows-02_2017-1.xls
                      Time AGIA TRIADA SIDIROKASTRO     KIPI NaN  \
0      2016-05-01 09:00:00     

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


../DATA/ENERSHARE_DATA _NTUA/Flows 2014 - Hourly - Excel/Hourly-Flows-11_2014.xls
../DATA/ENERSHARE_DATA _NTUA/Flows 2014 - Hourly - Excel/Hourly-Flows-09_2014 (1).xls
../DATA/ENERSHARE_DATA _NTUA/Flows 2014 - Hourly - Excel/Hourly-Flows-12_2014 (1).xls
                      Time AGIA TRIADA SIDIROKASTRO     KIPI NaN  \
0      2016-05-01 09:00:00           0       1806.4  1061.82 NaN   
1      2016-05-01 10:00:00           0      1383.59  1062.41 NaN   
2      2016-05-01 11:00:00           0      1197.51  1062.15 NaN   
3      2016-05-01 12:00:00           0      1216.02  1063.13 NaN   
4      2016-05-01 13:00:00           0      1231.65  1061.31 NaN   
...                    ...         ...          ...      ...  ..   
19747  2014-12-31 04:00:00     2209.78      2368.06  1068.13 NaN   
19748  2014-12-31 05:00:00      2204.1      2355.87  1064.45 NaN   
19749  2014-12-31 06:00:00     2206.08      2383.44  1067.98 NaN   
19750  2014-12-31 07:00:00     2196.84      2387.63  1231.16 NaN  

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


../DATA/ENERSHARE_DATA _NTUA/Flows 2015 - Hourly - Excel/Hourly-Flows-07_2015.xls
../DATA/ENERSHARE_DATA _NTUA/Flows 2015 - Hourly - Excel/Hourly-Flows-03_20151.xls
../DATA/ENERSHARE_DATA _NTUA/Flows 2015 - Hourly - Excel/Hourly-Flows-10_2015.xls
../DATA/ENERSHARE_DATA _NTUA/Flows 2015 - Hourly - Excel/Hourly-Flows-04_2015.xls
../DATA/ENERSHARE_DATA _NTUA/Flows 2015 - Hourly - Excel/Hourly-Flows-12_2015.xls
../DATA/ENERSHARE_DATA _NTUA/Flows 2015 - Hourly - Excel/Hourly-Flows-05_2015.xls
../DATA/ENERSHARE_DATA _NTUA/Flows 2015 - Hourly - Excel/Hourly-Flows-11_2015.xls
../DATA/ENERSHARE_DATA _NTUA/Flows 2015 - Hourly - Excel/Hourly-Flows-02_2015.xls
../DATA/ENERSHARE_DATA _NTUA/Flows 2015 - Hourly - Excel/Hourly-Flows-09_20153.xls
../DATA/ENERSHARE_DATA _NTUA/Flows 2015 - Hourly - Excel/Hourly-Flows-01_2015.xls
../DATA/ENERSHARE_DATA _NTUA/Flows 2015 - Hourly - Excel/Hourly-Flows-08_2015.xls
                      Time AGIA TRIADA SIDIROKASTRO     KIPI NaN  \
0      2016-05-01 09:00:00  

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


../DATA/ENERSHARE_DATA _NTUA/Flows 2018 - Hourly - Excel/hourly_flows_04_2018.xls
../DATA/ENERSHARE_DATA _NTUA/Flows 2018 - Hourly - Excel/Hourly-Flows-02_2018.xls
../DATA/ENERSHARE_DATA _NTUA/Flows 2018 - Hourly - Excel/hourly_flows_06_2018.xls
../DATA/ENERSHARE_DATA _NTUA/Flows 2018 - Hourly - Excel/hourly_flows_05_2018.xls
../DATA/ENERSHARE_DATA _NTUA/Flows 2018 - Hourly - Excel/Hourly-Flows-01_2018.xls
../DATA/ENERSHARE_DATA _NTUA/Flows 2018 - Hourly - Excel/Hourly-Flows-03_2018.xls
                      Time AGIA TRIADA SIDIROKASTRO     KIPI NaN  \
0      2016-05-01 09:00:00           0       1806.4  1061.82 NaN   
1      2016-05-01 10:00:00           0      1383.59  1062.41 NaN   
2      2016-05-01 11:00:00           0      1197.51  1062.15 NaN   
3      2016-05-01 12:00:00           0      1216.02  1063.13 NaN   
4      2016-05-01 13:00:00           0      1231.65  1061.31 NaN   
...                    ...         ...          ...      ...  ..   
33595  2018-03-31 03:00:00      

Unnamed: 0,Time,AGIA TRIADA,SIDIROKASTRO,KIPI,NaN,AGIOI THEODOROI,ALOYMINION,ALOYMINION II,ALOYMINION III,ATHENS,...,NaN.1,NaN.2,NaN.3,NaN.4,NaN.5,NaN.6,NaN.7,NaN.8,NaN.9,NaN.10
0,2016-05-01 09:00:00,0,1806.4,1061.82,,0,433.148,0,86.8998,125.045,...,,,,,,,,,,
1,2016-05-01 10:00:00,0,1383.59,1062.41,,0,432.006,0,81.6923,118.328,...,,,,,,,,,,
2,2016-05-01 11:00:00,0,1197.51,1062.15,,0,431.434,0,55.7734,109.389,...,,,,,,,,,,
3,2016-05-01 12:00:00,0,1216.02,1063.13,,0,431.758,0,84.5577,102.206,...,,,,,,,,,,
4,2016-05-01 13:00:00,0,1231.65,1061.31,,0,432.324,0,83.7748,90.3504,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33595,2018-03-31 03:00:00,0,2.48204e+06,588944,,1542.18,433995,0,91753.6,199606,...,,,,,,,,,,
33596,2018-03-31 04:00:00,0,2.48235e+06,588008,,3026.65,432942,0,93759.8,191899,...,,,,,,,,,,
33597,2018-03-31 05:00:00,0,2.48564e+06,589533,,1804.73,431970,0,94659.9,188597,...,,,,,,,,,,
33598,2018-03-31 06:00:00,0,2.47859e+06,592214,,2182.13,434411,0,93784.7,205407,...,,,,,,,,,,
