### Goals in this Notebook:
Make new files where the data is: 
> Clean, without null values. <br>
> Labeled as detectable planet around star (1) or not (0). <br>
> Set to the same time frame. <br>

### Imports:

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import time
import numpy as np
import random

%matplotlib inline

### Read in the Files:

In [2]:
# These are the confirmed planet stars from the first download session
c_planets = pd.read_csv('../clean_planet_data/extracted_planets_1_again.csv')

# These are the confirmed planet stars from the second download session
c_planets_2 = pd.read_csv('../clean_planet_data/extracted_confirmed_planets_2_again.csv')

c4_kep = pd.read_csv('../clean_planet_data/extracted_kep_c4_7700_backup.csv')

In [3]:
# drop the last line of c4_kep because it only downloaded halfway before being stopped
c4_kep.drop(index=7713, inplace = True)

# Start Munging:

### Randomly Select Lightcurves from the Duplicates in Confirmed Planets Set:
In the confirmed planets set, there are several light curves for each star. <br>
Randomly selecting the light curve from these will prevent counfounding variables from influencing the model. 

In [17]:
# combine both sets containing confirmed planets
combined_planets = pd.concat([c_planets, c_planets_2])

In [33]:
randomized_planets = pd.DataFrame(columns = c_planets.columns)

i = 0

# Randomly selecting each star's light curves from various time frames
for star in combined_planets['star_name'].unique():
    
    # Print out some feedback to show progress
    if i % 250 == 0:
        print(i)    
    i += 1

    # Randomly select light curve and add it to randomized_planets
    same_star = combined_planets[combined_planets['star_name'] == star] # select all rows whose stars have the same name
    rand_select = random.choice(same_star.index) # randomly select one of the index numbers
    randomized_planets = pd.concat([randomized_planets, same_star[same_star.index == rand_select]]) # add it to the new df

# Reset the index
randomized_planets.reset_index(drop = True, inplace = True)    
print('Finished!')

250
500
750
1000
1250


# Dealing with Nulls:

### Calculate Isolated Missing Values:
Fill 'one-off' missing values with mean imputation of the nearest two values.

In [42]:
for df in [c4_kep, randomized_planets]: # for each dataset
    
    is_null = df.isnull() # make a df of which values are null or not

    for i in range(df.shape[0]): # for each row

        # Print out some feedback to show progress
        if i % 500 == 0:
            print(i)

        for j in range(df.shape[1]-1): # for each column
            if j > 2: # skip the first three columns

                if is_null.iloc[i, j] == True: # if the cell is null
                    if not ((is_null.iloc[i, j-1] == True) | (is_null.iloc[i, j+1] == True)): # and if the adjacent cells are non-null
                        df.iloc[i, j] = np.mean([df.iloc[i, j-1], df.iloc[i, j+1]]) # assign the cell to be the mean of the adjacent cells

0
250
500
750
1000
1250
1500
1750
2000
2250
2500
2750
3000
3250
3500
3750
4000
4250
4500
4750
5000
5250
5500
5750
6000
6250
6500
6750
7000
7250
7500
0
250
500
750
1000
1250


### Closing Gaps in Data:

In [45]:
c4_kep.head()

Unnamed: 0,star_name,exposure,c_flux_0,c_flux_1,c_flux_2,c_flux_3,c_flux_4,c_flux_5,c_flux_6,c_flux_7,...,c_flux_4387,c_flux_4388,c_flux_4389,c_flux_4390,c_flux_4391,c_flux_4392,c_flux_4393,c_flux_4394,c_flux_4395,c_flux_4396
0,KIC 1025494,82.716241,,263338.2,263343.7,263349.3,263288.3,263403.4,263315.3,263307.6,...,263326.2,263309.6,263345.0,263305.5,263307.6,263341.2,263333.6,263333.2,263317.6,263352.6
1,KIC 1025578,82.716237,,12923.38,12920.32,12917.26,12913.29,12921.88,12922.11,12911.57,...,12912.79,12917.49,12924.33,12921.21,12911.19,12916.99,12917.56,12932.14,12931.86,12908.6
2,KIC 1025986,82.716227,,1276864.0,1277048.0,1277233.0,1277410.0,1277653.0,1277849.0,1278068.0,...,1272619.0,1273011.0,1273417.0,1273826.0,1274113.0,1274493.0,1274748.0,1275238.0,1275759.0,1276052.0
3,KIC 1026032,82.716228,,17436.18,17432.69,17429.21,17433.79,17430.04,17431.59,17427.26,...,17392.45,17395.75,17392.41,17405.89,17397.01,17388.93,17400.34,17393.58,17389.54,17385.59
4,KIC 1026133,82.716226,,80894.27,80898.23,80902.19,80913.89,80886.92,80895.93,80875.95,...,80906.45,80924.13,80899.6,80911.61,80901.57,80900.39,80929.84,80914.38,80914.08,80908.27


In [46]:
for df in [randomized_planets]: #c4_kep # for both datasets
#     df_count = 1 
    
    # Shifting values to fill nulls
    is_null = df.isnull()
    df_squished = pd.DataFrame()

    for i in range(df.shape[0]): # for each row
        
        # print out some feedback to show progress
        if i % 500 == 0:
            print(i)

        k = 0 # reset the df_squished column index to 0 for each new row
        
        for j in range(df.shape[1]): # for each column
            
            if is_null.iloc[i, j] == False: # if this cell is not null
                df_squished.loc[i, k] = df.iloc[i, j] # add the cell to df_squished
                k += 1
    
#     # save df_squished to different variables for each dataset
#     if df_count = 1: # if this is the c4_kep set
#         c4_kep_squished = df_squished
#     else: # for the randomized_planets set
randomized_planets_squished = df_squished
#     df_count += 1

0
500
1000


In [None]:
# Do this for c4_kep and confirmed planets

# fix this: the column names will no longer be relevent


# Assign Labels

### Assign Labels to Stars with Planets:

In [None]:
# Bring in a table that lists all confirmed planets with their star names and other info
all_confirmed = pd.read_csv('../clean_planet_data/all_planets_list.csv')

In [None]:
# Assign labels to c4_kep
not_found = 0
stars_to_drop = []

for j in range(len(c4_kep)): # for every light curve
    if j % 250 == 0:
        print(j)

    count = 0
    for i in range(len(all_confirmed)): # look through each star name in the list of all confirmed planets
        try:
            if all_confirmed.loc[i, 'Alternative star names'].find(c4_kep.iloc[j, 0]) != -1:
                count += 1
                print(c4_kep.iloc[j, 0], ' found @ index: ', j, 'orbital period: ', all_confirmed.loc[i, 'Orbital period [days]'])
                c4_kep.loc[j, '1'] = 1
    
        except AttributeError: # if the alternate star names value are null
            try:
                if all_confirmed.loc[i, 'Star name'].find(c4_kep.iloc[j, 0]) != -1:
                    count += 1
                    print(c4_kep.iloc[j, 0], ' found @ index: ', j, 'on 2nd level of loop', 'orbital period: ', all_confirmed.loc[i, 'Orbital period [days]'])
                    c4_kep.loc[j, '1'] = 1
                    
            except AttributeError: # if this is null too, keep going. There are few of these cases in the set
                continue
                
    if count == 0:
        not_found += 1

In [None]:
# change label on confirmed stars with no planets under the timeframe we're looking at
#     add a 'detectable' column?

# drop stars from confirmed planets that cannot be found? how many are there?, can I get this data somewhere else?

### Make Detectable Planets Label for Confirmed Planets Set:
Label should only be positive if the planet has a detectable orbitable period.

In [None]:
# clean all_confirmed planets obrital period column to be usable (numeric and no weird symbols)

# for each confirmed planet star
#     search for it in all_confirmed
#     if there is no planet with that star name with an orbit less than the detectable period
#         drop it

#     if it can't be found in all_confirmed
#         drop it and tally how many of these there are

### Set the Light Curves to the Same Time Frame:
That way there are no nulls and we can compare all the light curves from all datasets.

In [None]:
# max row length should be the number of nonmissing values in the shortest clean light curve
# make sure the feature names are consistent and usable

### Save to a New File:

In [None]:
# df_squished.to_csv('../clean_planet_data/clean_labeled_kep_c4.csv', index=False)

In [None]:
# least_null_planets.to_csv('../clean_planet_data/clean_labeled_planets.csv', index=False)

### Done!