# Notebook for loading excel files

### Function that converts .xlsx files to .xls 

In [None]:
import pyexcel as p

def xlsx_to_xls(filename):
    """Convert an ``.xlsx`` workbook to ``.xls`` next to the original file.

    Parameters
    ----------
    filename : str
        The complete path of the file whose format should be changed.
        The path must end with ``.xlsx``.

    Returns
    -------
    None
        A new file is written in the same directory as the original,
        with the same base name and the extension ``.xls``. The destination
        path is also printed.

    Raises
    ------
    ValueError
        If ``filename`` does not end with ``.xlsx``.
    """
    source_ext = '.xlsx'
    if not filename.endswith(source_ext):
        raise ValueError('Invalid File Format')

    # Swap only the trailing extension. A plain str.replace would also rewrite
    # any ".xlsx" that happens to appear earlier in the path (e.g. in a folder
    # name) and send the output to a different, possibly missing, location.
    dest_filename = filename[:-len(source_ext)] + '.xls'
    print(dest_filename)
    p.save_as(file_name=filename, dest_file_name=dest_filename)
    



# Example run: convert one workbook to .xls.
# NOTE(review): this is an absolute, machine-specific path -- adjust it before
# running the notebook on another machine.
xlsx_to_xls('/home/billys/Documents/thesis-bill/DATA/ENERSHARE_DATA _NTUA/Flows 2017 - Hourly- Excel/Hourly-Flows-02_2017-1.xlsx')

### Functions for adding holiday and weekday information to our DataFrame

In [2]:
from datetime import date
import datetime
import holidays


'''
def merge_date_and_time(df, date):
    if type(df['DateTime']) is datetime.datetime:
        return datetime.datetime.combine(date, df['DateTime'].time())
Function that does not work for now'''   


# Holiday calendars already built, keyed by country code. Building a calendar
# is comparatively expensive and this function is called once per row, so each
# calendar is created only once and then reused.
_HOLIDAY_CALENDARS = {}


def _holiday_calendar(country):
    """Return the cached holiday calendar for ``country``, building it on first use."""
    if country not in _HOLIDAY_CALENDARS:
        _HOLIDAY_CALENDARS[country] = holidays.country_holidays(country)
    return _HOLIDAY_CALENDARS[country]


def holiday_func(df, country='GR'):
    """Tell whether the ``DateTime`` value of a single row falls on a holiday.

    Parameters
    ----------
    df : Series or mapping
        One row of the data (as passed by ``DataFrame.apply(..., axis=1)``).
        Only ``df['DateTime']`` is read.
    country : str, optional
        Country code handed to ``holidays.country_holidays``. Defaults to
        ``'GR'``.

    Returns
    -------
    bool
        True if the row's ``DateTime`` is a holiday in ``country``,
        False otherwise.
    """
    return df['DateTime'] in _holiday_calendar(country)
    
    
def weekday_func(df):
    """Return the weekday number of a single row's ``DateTime`` value.

    Parameters
    ----------
    df : Series or mapping
        One row of the data; only ``df['DateTime']`` is read, and it must
        provide a ``weekday()`` method.

    Returns
    -------
    int
        Whatever ``df['DateTime'].weekday()`` returns (for ``datetime``
        objects: Monday is 0, Sunday is 6).
    """
    timestamp = df['DateTime']
    return timestamp.weekday()




### Function that reads a single excel file

The function below reads an Excel file given its path, reshapes every sheet into
the format we need and merges all the sheets into a single DataFrame.

In [3]:
import os
import pandas as pd
import datetime
import warnings


def _clean_daily_sheet(sheet):
    """Reshape one raw worksheet into its block of hourly rows.

    Row 1 of the parsed sheet is used as the column names and the ``Time``
    column is renamed to ``DateTime``. Each ``datetime`` cell in rows 2..25 of
    that column gets its date part replaced by the date found in row 0 of the
    same column. Finally rows 2..25 and columns 1..44 are kept, and every
    column that still contains a missing value is dropped.

    NOTE(review): every kept row is stamped with the sheet's own date, including
    hours after midnight -- confirm this matches the day convention of the data.
    """
    sheet = sheet.copy()
    sheet.columns = sheet.iloc[1]
    sheet = sheet.rename(columns={'Time': 'DateTime'})

    # Only rows 2..25 survive the slice below, so only those need fixing.
    # `.at` writes into the frame itself; the chained form
    # `sheet['DateTime'][row] = ...` is not guaranteed to write back and does
    # nothing when pandas Copy-on-Write is active.
    for row in range(2, min(26, len(sheet))):
        cell = sheet.at[row, 'DateTime']
        if type(cell) is datetime.datetime:
            sheet.at[row, 'DateTime'] = datetime.datetime.combine(
                sheet.at[0, 'DateTime'].date(), cell.time())

    return sheet.iloc[2:26, 1:45].dropna(axis=1)


def read_excel_file(filename):
    """Read every sheet of an ``.xls`` workbook and merge them into one frame.

    Parameters
    ----------
    filename : str
        The full path of the excel file (read with the ``xlrd`` engine).

    Returns
    -------
    DataFrame
        The hourly rows of all sheets stacked in sheet order with a fresh
        integer index, plus two derived columns: ``Holiday`` (from
        ``holiday_func``) and ``Weekday`` (from ``weekday_func``).

    Notes
    -----
    Warnings are silenced only while this function runs; the previous warning
    filters are restored on return. The number of sheets and the sheet names
    are printed.
    """
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        dfs_dict = pd.read_excel(filename, sheet_name=None, engine='xlrd')
        print("The number of sheets in this excel file is", len(dfs_dict))
        print(dfs_dict.keys())

        # Collect the per-sheet frames and concatenate once; growing a frame
        # with concat inside the loop copies all previous rows every time.
        frames = [_clean_daily_sheet(sheet) for sheet in dfs_dict.values()]
        result = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

        result['Holiday'] = result.apply(holiday_func, axis=1)
        result['Weekday'] = result.apply(weekday_func, axis=1)
    return result
    
# Example run on a single monthly workbook; as the last expression of the cell,
# the returned DataFrame is displayed below.
read_excel_file('../DATA/ENERSHARE_DATA _NTUA/Flows 2015 - Hourly - Excel/Hourly-Flows-01_2015.xls')
    

The number of sheets in this excel file is 31
dict_keys(['01.01.2015', '02.01.2015', '03.01.2015', '04.01.2015', '05.01.2015', '06.01.2015', '07.01.2015', '08.01.2015', '09.01.2015', '10.01.2015', '11.01.2015', '12.01.2015', '13.01.2015', '14.01.2015', '15.01.2015', '16.01.2015', '17.01.2015', '18.01.2015', '19.01.2015', '20.01.2015', '21.01.2015', '22.01.2015', '23.01.2015', '24.01.2015', '25.01.2015', '26.01.2015', '27.01.2015', '28.01.2015', '29.01.2015', '30.01.2015', '31.01.2015'])


1,DateTime,AGIA TRIADA,SIDIROKASTRO,KIPI,AGIOI THEODOROI,ALOYMINION,ALOYMINION II,ALOYMINION III,ATHENS,ALEXANDROUPOLIS,...,MOTOR OIL II,XANTHI,OINOFYTA,PLATY,SALFA ANTHOUSSA,SALFA ANO LIOSSIA,SERRES,TRIKALA,Holiday,Weekday
0,2015-01-01 09:00:00,1.86764,2364.744049,782.328269,0.136442,435.094792,0,90.974348,1022.775388,8.875103,...,0,3.111111,32.437839,22.045892,5.065852,6.079022,43.200003,36.495676,True,3
1,2015-01-01 10:00:00,0,2397.37897,512.001224,0.137161,433.903799,0,95.312855,964.888677,8.444974,...,0,0.497778,33.904285,24.193854,3.633519,4.360223,42.844448,41.044135,True,3
2,2015-01-01 11:00:00,219.04617,2438.456,513.323772,0.09216,433.682617,0,95.178879,909.654745,7.999464,...,0,2.186667,33.728165,23.57368,0.048014,0.057616,43.680003,38.414916,True,3
3,2015-01-01 12:00:00,738.346256,2438.654,513.631145,0.093333,434.947283,0,90.691483,946.129489,144.839386,...,0,0.604444,33.200799,22.287748,1.707938,2.049526,44.302226,35.512136,True,3
4,2015-01-01 13:00:00,715.792472,2462.358,512.974499,0.146365,436.569289,0,90.083176,1090.81902,7.070876,...,0,2.613334,33.894014,20.13443,2.710797,3.252957,44.657781,34.584173,True,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
739,2015-01-31 04:00:00,1768.631343,1629.473347,0,0.113794,422.185638,0,82.310059,241.831647,4.383861,...,0,2.648889,109.256497,12.888025,1.263889,1.516667,34.133336,9.195875,False,5
740,2015-01-31 05:00:00,1775.481892,1639.237113,0,1.100983,421.427839,0,83.995363,240.088031,4.874518,...,0,3.324445,93.968533,13.183526,0.405599,0.486719,31.644447,9.100323,False,5
741,2015-01-31 06:00:00,1768.134271,1659.015654,0,2.59893,421.301278,0,82.213748,256.186459,4.955473,...,0,2.88,88.470918,12.934834,6.760807,8.112968,29.155558,10.256304,False,5
742,2015-01-31 07:00:00,1782.075697,1671.575396,0,0.992284,421.592012,0,83.151725,369.370055,4.543445,...,0,2.791111,92.2171,12.941893,4.819058,5.78287,26.951113,11.696423,False,5


### Function that reads all Excel files in multiple folders, merges them into a single DataFrame and saves it to a CSV file

In [4]:
import os
import pandas as pd
import datetime
import warnings


def _hourly_rows_from_sheet(sheet):
    """Reshape one raw worksheet into its block of hourly rows.

    Row 1 of the parsed sheet is used as the column names. Each ``datetime``
    cell in rows 2..25 of the ``Time`` column gets its date part replaced by
    the date found in row 0 of the same column. Finally rows 2..25 and columns
    1..44 are kept, and every column that still contains a missing value is
    dropped.

    NOTE(review): every kept row is stamped with the sheet's own date, including
    hours after midnight -- confirm this matches the day convention of the data.
    """
    sheet = sheet.copy()
    sheet.columns = sheet.iloc[1]

    # Only rows 2..25 survive the slice below, so only those need fixing.
    # `.at` writes into the frame itself; the chained form
    # `sheet['Time'][row] = ...` is not guaranteed to write back and does
    # nothing when pandas Copy-on-Write is active.
    for row in range(2, min(26, len(sheet))):
        cell = sheet.at[row, 'Time']
        if type(cell) is datetime.datetime:
            sheet.at[row, 'Time'] = datetime.datetime.combine(
                sheet.at[0, 'Time'].date(), cell.time())

    return sheet.iloc[2:26, 1:45].dropna(axis=1)


def read_files(folder_path, prefix, output_csv='../EXCEL_TO_CSV/all_data.csv'):
    """Load every ``.xls`` workbook found in the matching sub-folders of
    ``folder_path``, merge all their sheets into one dataframe sorted by time,
    and save that dataframe as a csv file.

    Parameters
    ----------
    folder_path : str
        The full path to the directory that contains other directories
        that contain the files we want to load

    prefix : str
        A prefix that exists in the beginning of every folder name. Entries
        of ``folder_path`` that do not start with it, or that are not
        directories, are skipped.

    output_csv : str, optional
        Where the merged data is written. The containing directory is created
        if it does not exist yet. Defaults to ``'../EXCEL_TO_CSV/all_data.csv'``.

    Returns
    -------
    DataFrame
        All hourly rows of all workbooks, sorted by time (the index keeps the
        pre-sort row numbers), with the time column named ``DateTime`` and two
        derived columns: ``Holiday`` (from ``holiday_func``) and ``Weekday``
        (from ``weekday_func``).

    Raises
    ------
    KeyError
        If no sheet was loaded at all, because the merged frame then has no
        ``Time`` column to sort by.

    Notes
    -----
    Folders and files are visited in sorted name order so that the result does
    not depend on the order in which the operating system lists them. Warnings
    are silenced only while this function runs. The file list of each folder
    and the path of each workbook are printed as progress output.
    """
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')

        frames = []
        for foldername in sorted(os.listdir(folder_path)):
            file_path = os.path.join(folder_path, foldername)
            if not foldername.startswith(prefix) or not os.path.isdir(file_path):
                continue

            # Open all excel files from those folders
            filenames = sorted(os.listdir(file_path))
            print(filenames)
            for filename in filenames:
                temp_filename = os.path.join(file_path, filename)
                if not temp_filename.endswith('.xls'):
                    continue
                print(temp_filename)
                dfs_dict = pd.read_excel(temp_filename, engine='xlrd', sheet_name=None, nrows=28)
                frames.extend(_hourly_rows_from_sheet(sheet) for sheet in dfs_dict.values())

        # Concatenate once; growing a frame with concat inside the loops copies
        # all previously loaded rows for every additional sheet.
        result = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

        result = result.sort_values(by='Time')
        result = result.rename(columns={'Time': 'DateTime'})
        result['Holiday'] = result.apply(holiday_func, axis=1)
        result['Weekday'] = result.apply(weekday_func, axis=1)

        output_dir = os.path.dirname(output_csv)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        result.to_csv(output_csv)
    return result
    
# Load every workbook under the "Flows ..." folders; this also writes the merged
# CSV. As the last expression of the cell, the returned DataFrame is displayed below.
read_files('../DATA/ENERSHARE_DATA _NTUA', 'Flows')
    

['Hourly-Flows-09_2014 (1).xls', 'Hourly-Flows-10_2014.xls', 'Hourly-Flows-11_2014.xls', 'Hourly-Flows-12_2014 (1).xls']
../DATA/ENERSHARE_DATA _NTUA\Flows 2014 - Hourly - Excel\Hourly-Flows-09_2014 (1).xls
../DATA/ENERSHARE_DATA _NTUA\Flows 2014 - Hourly - Excel\Hourly-Flows-10_2014.xls
../DATA/ENERSHARE_DATA _NTUA\Flows 2014 - Hourly - Excel\Hourly-Flows-11_2014.xls
../DATA/ENERSHARE_DATA _NTUA\Flows 2014 - Hourly - Excel\Hourly-Flows-12_2014 (1).xls
['Hourly-Flows-01_2015.xls', 'Hourly-Flows-02_2015.xls', 'Hourly-Flows-03_20151.xls', 'Hourly-Flows-04_2015.xls', 'Hourly-Flows-05_2015.xls', 'Hourly-Flows-06_2015.xls', 'Hourly-Flows-07_2015.xls', 'Hourly-Flows-08_2015.xls', 'Hourly-Flows-09_20153.xls', 'Hourly-Flows-10_2015.xls', 'Hourly-Flows-11_2015.xls', 'Hourly-Flows-12_2015.xls']
../DATA/ENERSHARE_DATA _NTUA\Flows 2015 - Hourly - Excel\Hourly-Flows-01_2015.xls
../DATA/ENERSHARE_DATA _NTUA\Flows 2015 - Hourly - Excel\Hourly-Flows-02_2015.xls
../DATA/ENERSHARE_DATA _NTUA\Flows 2015 

1,DateTime,AGIA TRIADA,SIDIROKASTRO,KIPI,AGIOI THEODOROI,ALOYMINION,ALOYMINION II,ALOYMINION III,ATHENS,ALEXANDROUPOLIS,...,OINOFYTA,PLATY,SALFA ANTHOUSSA,SALFA ANO LIOSSIA,SERRES,TRIKALA,MEGALOPOLIS\n(PPC),ELPE - HAR,Holiday,Weekday
15,2014-09-01 00:00:00,0,1968.17,1081.714747,1.006238,386.378384,0,86.990502,107.718607,0,...,90.74072,9.586922,24.889061,28.62235,12.800001,0,,,False,0
16,2014-09-01 01:00:00,0,1970.4,1085.089518,1.425448,88.890634,0,85.540484,97.470909,0,...,80.107956,9.426204,16.216738,28.64955,9.813334,0,,,False,0
17,2014-09-01 02:00:00,0,1976.5,1087.033615,1.22682,76.079663,0,82.496048,85.741823,0,...,98.61475,9.655958,23.750735,27.31365,1.884445,0,,,False,0
18,2014-09-01 03:00:00,0,1981.65,1087.900093,1.236243,73.233812,0,82.94189,82.081179,0,...,102.719525,9.697519,1.927019,5.21605,1.457778,0,,,False,0
19,2014-09-01 04:00:00,0,1990.28,1088.613414,0.997471,71.086599,0,86.595267,84.646194,0,...,107.171301,9.562005,0.005786,6.0069,1.528889,0,,,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33587,2018-07-31 19:00:00,2752391.659804,2860324.203805,0,2953.968267,415547.333795,746905.805378,78549.984372,140347.920109,2647.115778,...,119193.826475,8751.307257,1817.815664,1233.173055,,,0,128525.05748,False,1
33588,2018-07-31 20:00:00,2740818.523473,2863400.16038,0,1094.547602,415838.181345,753553.1661,82596.367186,137494.563884,3443.724987,...,124396.990087,8716.139199,10920.816564,12353.928608,,,0,133812.508025,False,1
33589,2018-07-31 21:00:00,2742809.043366,2874768.10611,0,2347.247799,416212.683834,687621.411863,81791.453126,136852.089336,2145.741484,...,128303.88295,9186.457855,11835.489013,37350.648576,,,0,133290.66414,False,1
33590,2018-07-31 22:00:00,2744017.276039,2890264.961532,0,3245.851321,416959.621486,751043.586778,84686.156251,140395.434503,1675.95602,...,129413.313672,8803.363411,11526.262876,25002.374476,,,0,131403.796398,False,1
