### Imports:

In [1]:
import pandas as pd
import os
import time
from astropy.io import fits

### Extraction Loop:
The light curve data in the .fit files is contained in a list of tuples along with other measurements. <br>
Each tuple is an observation point for a specific time, the observations are taken 30min apart. <br>
We want the 3rd value in each tuple, which corresponds to the corrected flux levels recorded by the instruments in the spacecraft in electrons per second. <br>
#### This loop extracts this data from each .fit file, compiles it into a dataframe, and saves the dataframe as .csv file.

In [2]:
### If extracting from confirmed_planets you must set to item[7] when iterating through items in the .fit data
#     item[2] for k2c1
directory = '../raw_space_data/confirmed_planets/'
save_name = 'extracted_planets_1_again'

In [3]:
start_time = time.time()

# The following lines will prepare things to be run through the loop
col_names = ['star_name', 'exposure']
df = pd.DataFrame(columns = [col_names])
i = 0

for filename in os.listdir(directory): # Loop through every .fit file in the directory
#     if i > 5:
#         break
    if filename.endswith(".fits"):
   
        fit_data = fits.open(directory + filename)
        if fit_data[1].header['EXPOSURE'] < 65: # if the exposure is less than 65 days, skip this file
            continue
        df.loc[i, 'star_name'] = fit_data[1].header['OBJECT']    # Extract the star's name
        df.loc[i, 'exposure'] = fit_data[1].header['EXPOSURE']   # Extract the amount of time Kepler was recording the target
        
        j = 0 
        for item in fit_data[1].data:     # iterate through the items in the .fit data 
            col_name = 'c_flux_' + str(j) # create the column label
            df.loc[i, col_name] = item[7] # item[7] for confirmed planets, item[2] for k2c1, extract the corrected flux levels for each item and put it all in a row
            
            j += 1
            if j > 4000: # We won't be needing more columns than this for this project
                break
        
        if i % 100 == 0: # save the progress and print out some feedback every 100 files
            df.to_csv(('../' + save_name + '.csv'), index=False)
            print('Rows saved: ', i, ' ', round((time.time() - start_time)/60, 2), ' min runtime.')
        
        i += 1

# Done!
df.to_csv(('../' + save_name + '.csv'), index=False)
print('DONE!!!!!', 'Runtime: ', round((time.time() - start_time)/60, 3), ' minutes')

Rows saved:  0   0.21  min runtime.
Rows saved:  100   8.15  min runtime.
Rows saved:  200   16.13  min runtime.
Rows saved:  300   24.15  min runtime.
Rows saved:  400   32.2  min runtime.
Rows saved:  500   40.3  min runtime.
Rows saved:  600   48.48  min runtime.
Rows saved:  700   56.67  min runtime.
Rows saved:  800   64.92  min runtime.
Rows saved:  900   73.19  min runtime.
Rows saved:  1000   81.53  min runtime.
Rows saved:  1100   90.07  min runtime.
Rows saved:  1200   99.36  min runtime.
Rows saved:  1300   108.8  min runtime.
Rows saved:  1400   118.23  min runtime.
Rows saved:  1500   127.25  min runtime.
Rows saved:  1600   135.87  min runtime.
Rows saved:  1700   144.54  min runtime.
Rows saved:  1800   153.23  min runtime.
Rows saved:  1900   161.99  min runtime.
Rows saved:  2000   170.8  min runtime.
Rows saved:  2100   179.61  min runtime.
Rows saved:  2200   188.49  min runtime.
Rows saved:  2300   197.43  min runtime.
Rows saved:  2400   206.38  min runtime.
Rows s

### Check the Dataframe to be Sure Everything Worked:
There should be some null values at the end of most of the rows because not all the light curves are the same length.

In [4]:
df.tail()

Unnamed: 0,star_name,exposure,c_flux_0,c_flux_1,c_flux_2,c_flux_3,c_flux_4,c_flux_5,c_flux_6,c_flux_7,...,c_flux_3991,c_flux_3992,c_flux_3993,c_flux_3994,c_flux_3995,c_flux_3996,c_flux_3997,c_flux_3998,c_flux_3999,c_flux_4000
5539,KIC 12254909,87.1767,,63255.378906,63257.652344,63256.613281,63253.457031,63237.640625,63258.542969,63227.269531,...,63250.996094,63257.761719,63247.835938,63242.769531,63248.015625,63255.644531,63257.492188,63246.757812,63256.394531,63269.320312
5540,KIC 12254909,82.7348,,64496.191406,64523.871094,64547.769531,64535.445312,64530.316406,64536.335938,64543.359375,...,64543.347656,64527.554688,64546.824219,64526.988281,64534.796875,64529.234375,64538.226562,64541.921875,64554.035156,64544.078125
5541,KIC 12254909,82.2996,,65214.257812,65212.6875,65196.605469,65208.125,65210.011719,65210.316406,65210.0625,...,65225.414062,65205.414062,65248.800781,65213.429688,65219.933594,65212.007812,65189.375,65219.539062,65217.109375,65215.449219
5542,KIC 12254909,89.6976,,62412.324219,62415.324219,62406.960938,62418.777344,62413.421875,62402.933594,62396.136719,...,62410.253906,62405.71875,62409.714844,62435.546875,62415.730469,62413.988281,62419.355469,62431.804688,62427.863281,62435.519531
5543,KIC 12254909,86.027,,65026.0,65048.792969,65063.957031,65058.011719,65044.015625,65051.265625,65057.699219,...,65037.945312,65036.355469,65036.5,65048.902344,65040.390625,65034.003906,65040.039062,65036.71875,65033.417969,65043.886719


### Save a Backup File.
Just in case...

In [5]:
df.to_csv(('../' + save_name + '_backup.csv'), index=False)

Check to see that the file can be read:

In [6]:
# clean = pd.read_csv('../clean_confirmed_planets_part2.csv')
# clean.head()

### Great! Everything looks good!

In [7]:
# fit_check = fits.open('../raw_space_data/confirmed_planets/' + 'kplr010010440-2013098041711_llc.fits')


In [8]:
# data_check = fit_check[1].data
# fit_check[1].header

In [9]:
# data_check

In [10]:
# checking = []
# for i in range(len(data_check)):
#     checking.append(data_check[i][7])

In [11]:
# plt.scatter(range(len(data_check)),checking)
# # plt.ylim(33500, 33600)

In [12]:
# import matplotlib.pyplot as plt

# %matplotlib inline

In [13]:
# checking