### Imports:

In [1]:
import os
import time

import matplotlib.pyplot as plt
import pandas as pd
from astropy.io import fits

### Extracting the Light Curve:
The light curve data in each .fits file is stored as a list of tuples, alongside other measurements. <br>
Each tuple is one observation at a specific time; observations are taken 30 minutes apart. <br>
From each tuple we want the corrected flux level recorded by the instruments in the spacecraft, in electrons per second: index 7 for the Kepler files read here (index 2, i.e. the 3rd value, for the k2c1 files). <br>
#### This loop extracts this data from each .fits file, compiles it into a dataframe, and saves the dataframe as a .csv file.

In [2]:
# Run configuration for the extraction loop that follows.
# NOTE: the position of the corrected flux inside each record depends on the data set:
#     index 7 when extracting from confirmed_planets, index 2 for k2c1.
directory = '../raw_space_data/Kepler_Q4/'  # folder scanned for .fits files; keep the trailing slash, file names are appended to it directly
save_name = 'extracted_kep_c4'  # stem of the output files, written as '../<save_name>.csv'

In [None]:
start_time = time.time()

# Settings for this run, gathered here so they are easy to adjust.
flux_index = 7          # position of the corrected flux in each record: 7 for confirmed planets, 2 for k2c1
min_exposure = 65       # files whose EXPOSURE header is below this many days are skipped
checkpoint_every = 100  # write the partial CSV and report progress every this many accepted files
csv_path = '../' + save_name + '.csv'

# One dict per accepted file. The DataFrame is built from these in a single step,
# because growing a frame cell-by-cell with .loc re-allocates it on every new column.
records = []

# Flat (non-nested) column labels; this empty frame is what gets saved if no file qualifies.
df = pd.DataFrame(columns=['star_name', 'exposure'])

# Sorted so the row order is the same on every run (os.listdir order is arbitrary).
for filename in sorted(os.listdir(directory)):
    if not filename.endswith(".fits"):
        continue

    # The context manager closes the file handle even when the file is skipped.
    with fits.open(os.path.join(directory, filename)) as fit_data:
        header = fit_data[1].header
        if header['EXPOSURE'] < min_exposure:
            continue

        record = {
            'star_name': header['OBJECT'],    # the star's name
            'exposure': header['EXPOSURE'],   # how long Kepler was recording the target
        }
        # Read the whole flux column at once and label the values c_flux_0, c_flux_1, ...
        # This must happen while the file is still open.
        flux = fit_data[1].data.field(flux_index)
        record.update(('c_flux_' + str(j), value) for j, value in enumerate(flux))

    records.append(record)

    i = len(records) - 1  # zero-based index of the row just added
    if i % checkpoint_every == 0:  # save the progress and print out some feedback
        df = pd.DataFrame(records)
        df.to_csv(csv_path, index=False)
        print('Rows saved: ', i, ' ', round((time.time() - start_time)/60, 2), ' min runtime.')

# Done! Light curves differ in length, so shorter rows end in null values.
if records:
    df = pd.DataFrame(records)
i = len(records)  # number of files that were extracted
df.to_csv(csv_path, index=False)
print('DONE!!!!!', 'Runtime: ', round((time.time() - start_time)/60, 3), ' minutes')

Rows saved:  0   0.23  min runtime.
Rows saved:  100   9.39  min runtime.
Rows saved:  200   18.35  min runtime.
Rows saved:  300   27.11  min runtime.
Rows saved:  400   36.67  min runtime.
Rows saved:  500   46.87  min runtime.
Rows saved:  600   57.05  min runtime.


### Check the Dataframe to be Sure Everything Worked:
There should be some null values at the end of most of the rows because not all the light curves are the same length.

In [None]:
# Show the last rows of the extracted frame as a quick sanity check.
df.tail()

### Save a Backup File.
Just in case...

In [None]:
# Write a second copy of the frame under a separate '_backup' file name.
backup_path = '../' + save_name + '_backup.csv'
df.to_csv(backup_path, index=False)

Check to see that the file can be read:

In [None]:
# clean = pd.read_csv('../clean_confirmed_planets_part2.csv')
# clean.head()

### Great! Everything looks good!

In [None]:
# Open a single light-curve file for manual inspection.
# The handle is left open on purpose: the following cells read its data and header.
sample_file = 'kplr010000056-2010078095331_llc.fits'
fit_check = fits.open(directory + sample_file)


In [None]:
# Keep the records of the second HDU for the checks below, then display its header.
light_curve_hdu = fit_check[1]
data_check = light_curve_hdu.data
light_curve_hdu.header

In [None]:
# Display the records loaded from the sample file.
data_check

In [None]:
# Collect the value at index 7 (the flux used by the extraction loop) from every record
# of the sample file. A comprehension avoids rebinding the notebook-wide counter `i`.
checking = [record[7] for record in data_check]

In [None]:
# Scatter the extracted flux values against their position in the file so the
# light curve can be inspected visually.
fig, ax = plt.subplots()
ax.scatter(range(len(checking)), checking)
ax.set(
    title='Sample light curve',
    xlabel='Observation index',
    ylabel='Flux (record index 7)',
)
# ax.set_ylim(335900, 336250)  # uncomment to zoom in on a narrow flux range
plt.show()

In [None]:
import matplotlib.pyplot as plt

%matplotlib inline

In [None]:
# checking