### Goals in this Notebook:
Make new files where the data is: 
> Clean, without null values. <br>
> Labeled as detectable planet around star (1) or not (0). <br>
> Set to the same time frame. <br>

### Imports:

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import time
import numpy as np
import random

%matplotlib inline

### Read in the Files:

In [2]:
# These are the confirmed planet stars from the first download session
c_planets = pd.read_csv('../clean_planet_data/extracted_planets_1_again.csv')

# These are the confirmed planet stars from the second download session
c_planets_2 = pd.read_csv('../clean_planet_data/extracted_confirmed_planets_2_again.csv')

c4_kep = pd.read_csv('../clean_planet_data/extracted_kep_c4_7700_backup.csv')

In [3]:
# drop the last line of c4_kep because it only downloaded halfway before being stopped
c4_kep.drop(index=7713, inplace = True)

# Start Munging:

### Randomly Select Lightcurves from the Duplicates in Confirmed Planets Set:
In the confirmed planets set, there are several light curves for each star. <br>
Randomly selecting the light curve from these will prevent counfounding variables from influencing the model. 

In [4]:
# combine both sets containing confirmed planets
combined_planets = pd.concat([c_planets, c_planets_2])

In [5]:
np.random.seed(112) # set random seed

randomized_planets = pd.DataFrame(columns = c_planets.columns)

i = 0

# Randomly selecting each star's light curves from various time frames
for star in combined_planets['star_name'].unique():
    
    # Print out some feedback to show progress
    if i % 250 == 0:
        print(i)    
    i += 1

    # Randomly select light curve and add it to randomized_planets
    same_star = combined_planets[combined_planets['star_name'] == star] # select all rows whose stars have the same name
    rand_select = random.choice(same_star.index) # randomly select one of the index numbers
    randomized_planets = pd.concat([randomized_planets, same_star[same_star.index == rand_select]]) # add it to the new df

# Reset the index
randomized_planets.reset_index(drop = True, inplace = True)    
print('Finished!')

0
250
500
750
1000
1250
Finished!


# Dealing with Nulls:

### Calculate Isolated Missing Values:
Fill 'one-off' missing values with mean imputation of the nearest two values.

In [6]:
for df in [c4_kep, randomized_planets]: # for each dataset
    
    is_null = df.isnull() # make a df of which values are null or not

    for i in range(df.shape[0]): # for each row

        # Print out some feedback to show progress
        if i % 500 == 0:
            print(i)

        for j in range(df.shape[1]-1): # for each column
            if j > 2: # skip the first three columns

                if is_null.iloc[i, j] == True: # if the cell is null
                    if not ((is_null.iloc[i, j-1] == True) | (is_null.iloc[i, j+1] == True)): # and if the adjacent cells are non-null
                        df.iloc[i, j] = np.mean([df.iloc[i, j-1], df.iloc[i, j+1]]) # assign the cell to be the mean of the adjacent cells

0
500
1000
1500
2000
2500
3000
3500
4000
4500
5000
5500
6000
6500
7000
7500
0
500
1000


### Closing Gaps in Data:

In [7]:
c4_kep.head()

Unnamed: 0,star_name,exposure,c_flux_0,c_flux_1,c_flux_2,c_flux_3,c_flux_4,c_flux_5,c_flux_6,c_flux_7,...,c_flux_4387,c_flux_4388,c_flux_4389,c_flux_4390,c_flux_4391,c_flux_4392,c_flux_4393,c_flux_4394,c_flux_4395,c_flux_4396
0,KIC 1025494,82.716241,,263338.2,263343.7,263349.3,263288.3,263403.4,263315.3,263307.6,...,263326.2,263309.6,263345.0,263305.5,263307.6,263341.2,263333.6,263333.2,263317.6,263352.6
1,KIC 1025578,82.716237,,12923.38,12920.32,12917.26,12913.29,12921.88,12922.11,12911.57,...,12912.79,12917.49,12924.33,12921.21,12911.19,12916.99,12917.56,12932.14,12931.86,12908.6
2,KIC 1025986,82.716227,,1276864.0,1277048.0,1277233.0,1277410.0,1277653.0,1277849.0,1278068.0,...,1272619.0,1273011.0,1273417.0,1273826.0,1274113.0,1274493.0,1274748.0,1275238.0,1275759.0,1276052.0
3,KIC 1026032,82.716228,,17436.18,17432.69,17429.21,17433.79,17430.04,17431.59,17427.26,...,17392.45,17395.75,17392.41,17405.89,17397.01,17388.93,17400.34,17393.58,17389.54,17385.59
4,KIC 1026133,82.716226,,80894.27,80898.23,80902.19,80913.89,80886.92,80895.93,80875.95,...,80906.45,80924.13,80899.6,80911.61,80901.57,80900.39,80929.84,80914.38,80914.08,80908.27


In [8]:
randomized_planets

Unnamed: 0,star_name,exposure,c_flux_0,c_flux_1,c_flux_2,c_flux_3,c_flux_4,c_flux_5,c_flux_6,c_flux_7,...,c_flux_3991,c_flux_3992,c_flux_3993,c_flux_3994,c_flux_3995,c_flux_3996,c_flux_3997,c_flux_3998,c_flux_3999,c_flux_4000
0,KIC 10000941,81.906679,,,,,,,,,...,52522.144531,52541.027344,52551.097656,52578.242188,52583.531250,52588.527344,52617.773438,52577.300781,52610.238281,52614.976562
1,KIC 10001368,86.026673,,34643.652344,34646.644531,34638.632812,34626.734375,34646.964844,34647.710938,34636.976562,...,34636.105469,34637.558594,34636.636719,34638.433594,34648.359375,34631.878906,34632.152344,34636.234375,34634.910156,34636.464844
2,KIC 10001893,89.917847,,6493.119629,6486.426270,6498.390625,6483.745605,6494.564941,6502.554688,6481.802734,...,6505.922852,6487.660645,6496.463867,6483.562988,6501.854980,6492.566406,6493.122070,6493.307129,6494.298340,6494.553223
3,KIC 10002866,89.488276,,13695.596680,13706.898438,13703.199219,13698.659180,13699.879883,13697.466797,13701.559570,...,13776.031250,13773.433594,13783.834961,13777.468750,13784.899414,13771.859375,13781.013672,13775.145508,13768.628906,13767.840820
4,KIC 10004519,89.428692,,9705.454102,9704.304688,9704.225586,9721.155273,9710.652344,9711.234375,9708.650391,...,9704.010742,9705.146484,9709.726562,9711.235352,9710.955078,9710.334961,9713.541992,9703.168945,9705.213867,9708.651367
5,KIC 10004738,79.066551,,29388.349609,29394.541016,29388.070312,29391.054688,29384.857422,29394.859375,29387.296875,...,29392.423828,29389.343750,29399.916016,29393.535156,29394.498047,29388.357422,29388.027344,29388.039062,29384.507812,29392.087891
6,KIC 10005788,76.075332,,10604.299805,10596.361328,10600.237305,10595.707031,10603.730469,10598.882812,10589.682617,...,10553.635742,10551.680664,10556.650391,10557.738281,10548.358398,10560.642578,10562.210938,10558.595703,10563.617188,10558.837891
7,KIC 10006581,89.698048,,16854.275391,16865.250000,16864.521484,16866.705078,16866.515625,16863.703125,16858.732422,...,16859.451172,16854.666016,16861.060547,16851.314453,16851.390625,16857.978516,16866.607422,16854.626953,16853.351562,16860.679688
8,KIC 10010440,83.170176,,15554.692383,15560.917969,15571.168945,15557.334961,15557.336914,15562.561523,15567.510742,...,15545.677734,15551.957031,15554.178711,15560.767578,15553.257812,15548.775391,15548.677734,15550.393555,15555.295898,15547.943359
9,KIC 10018233,87.177304,,11821.462891,11821.990234,11812.525391,11825.382812,11842.602539,11823.055664,11817.125000,...,11815.748047,11828.895508,11826.475586,11826.976562,11820.697266,11826.281250,11824.744141,11835.597656,11826.556641,11831.149414


In [9]:
randomized_planets.shape

(1265, 4003)

In [10]:
for df in [randomized_planets]: #c4_kep # for both datasets
#     df_count = 1 
    
    # Shifting values to fill nulls
    is_null = df.isnull()
    df_squished = pd.DataFrame()

    for i in range(df.shape[0]): # for each row
        
        # print out some feedback to show progress
        if i % 100 == 0:
            print(i)

        k = 0 # reset the df_squished column index to 0 for each new row
        
        for j in range(df.shape[1]): # for each column
            
            if is_null.iloc[i, j] == False: # if this cell is not null
                df_squished.loc[i, k] = df.iloc[i, j] # add the cell to df_squished
                k += 1
    
#     # save df_squished to different variables for each dataset
#     if df_count = 1: # if this is the c4_kep set
#         c4_kep_squished = df_squished
#     else: # for the randomized_planets set
randomized_planets_squished = df_squished
#     df_count += 1

0
100
200
300
400
500
600
700
800
900
1000
1100
1200


In [11]:
# Do this for c4_kep and confirmed planets

# fix this: the column names will no longer be relevent
check_it_out = randomized_planets_squished.head(1267).iloc[:,:3199]
check_it_out

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3189,3190,3191,3192,3193,3194,3195,3196,3197,3198
0,KIC 10000941,81.906679,51606.843750,51589.371094,51586.289062,51589.476562,51585.058594,51598.816406,51587.203125,51583.257812,...,52366.425781,52349.691406,52348.902344,52362.187500,52372.296875,52386.378906,52390.582031,52401.617188,52398.761719,52397.660156
1,KIC 10001368,86.026673,34643.652344,34646.644531,34638.632812,34626.734375,34646.964844,34647.710938,34636.976562,34651.855469,...,34653.347656,34639.554688,34640.605469,34652.585938,34642.792969,34643.730469,34639.578125,34635.425781,34643.242188,34638.511719
2,KIC 10001893,89.917847,6493.119629,6486.426270,6498.390625,6483.745605,6494.564941,6502.554688,6481.802734,6500.023438,...,6502.413086,6494.500000,6491.939453,6500.797852,6484.349121,6506.081543,6492.395508,6481.608398,6489.268555,6500.530762
3,KIC 10002866,89.488276,13695.596680,13706.898438,13703.199219,13698.659180,13699.879883,13697.466797,13701.559570,13696.207031,...,13659.816406,13660.325195,13664.023438,13652.503906,13662.452148,13658.985352,13663.659180,13660.121094,13663.482422,13658.893555
4,KIC 10004519,89.428692,9705.454102,9704.304688,9704.225586,9721.155273,9710.652344,9711.234375,9708.650391,9713.518555,...,9708.580078,9705.692383,9709.043945,9709.913086,9714.366211,9703.272461,9700.703125,9709.462891,9708.046875,9709.661133
5,KIC 10004738,79.066551,29388.349609,29394.541016,29388.070312,29391.054688,29384.857422,29394.859375,29387.296875,29386.708984,...,29380.410156,29384.673828,29397.039062,29386.798828,29393.279297,29380.443359,29385.664062,29392.171875,29393.054688,29379.460938
6,KIC 10005788,76.075332,10604.299805,10596.361328,10600.237305,10595.707031,10603.730469,10598.882812,10589.682617,10595.082031,...,10578.530273,10577.953125,10579.473633,10577.183594,10578.195312,10583.036133,10572.682617,10583.517578,10577.040039,10575.938477
7,KIC 10006581,89.698048,16854.275391,16865.250000,16864.521484,16866.705078,16866.515625,16863.703125,16858.732422,16861.509766,...,16861.597656,16860.066406,16858.835938,16860.845703,16863.000000,16872.173828,16866.130859,16873.837891,16866.992188,16862.892578
8,KIC 10010440,83.170176,15554.692383,15560.917969,15571.168945,15557.334961,15557.336914,15562.561523,15567.510742,15560.070312,...,15559.992188,15565.981445,15560.169922,15555.788086,15564.140625,15561.482422,15562.884766,15568.705078,15565.151367,15557.879883
9,KIC 10018233,87.177304,11821.462891,11821.990234,11812.525391,11825.382812,11842.602539,11823.055664,11817.125000,11824.669922,...,11828.780273,11819.432617,11824.672852,11828.072266,11828.934570,11828.065430,11820.541016,11835.219727,11834.395508,11825.008789


# Assign Labels

### Assign Labels to Stars with Planets:

In [None]:
# Bring in a table that lists all confirmed planets with their star names and other info
all_confirmed = pd.read_csv('../clean_planet_data/all_planets_list.csv')

In [None]:
# Assign labels to c4_kep
not_found = 0

for j in range(len(c4_kep)): # for every light curve
    if j % 250 == 0:
        print(j)

    count = 0
    for i in range(len(all_confirmed)): # look through each star name in the list of all confirmed planets
        try:
            if all_confirmed.loc[i, 'Alternative star names'].find(c4_kep.iloc[j, 0]) != -1:
                count += 1
                print(c4_kep.iloc[j, 0], ' found @ index: ', j, 'orbital period: ', all_confirmed.loc[i, 'Orbital period [days]'])
                c4_kep.loc[j, '1'] = 1
    
        except AttributeError: # if the alternate star names value are null
            try:
                if all_confirmed.loc[i, 'Star name'].find(c4_kep.iloc[j, 0]) != -1:
                    count += 1
                    print(c4_kep.iloc[j, 0], ' found @ index: ', j, 'on 2nd level of loop', 'orbital period: ', all_confirmed.loc[i, 'Orbital period [days]'])
                    c4_kep.loc[j, '1'] = 1
                    
            except AttributeError: # if this is null too, keep going. There are few of these cases in the set
                continue
                
    if count == 0:
        not_found += 1

In [None]:
# change label on confirmed stars with no planets under the timeframe we're looking at
#     add a 'detectable' column?

# drop stars from confirmed planets that cannot be found? how many are there?, can I get this data somewhere else?

### Make Detectable Planets Label for Confirmed Planets Set:
Label should only be positive if the planet has a detectable orbitable period.

In [None]:
for item in all_confirmed['Orbital period [days]']:
    
    item = str(item)
    clean_orbit = ''
    for letter in item:
        
        if letter in ['.', '+', '-', '±']:
            break
        elif letter in ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']:
            clean_orbit = clean_orbit + letter
            
    if clean_orbit != '':
        clean_orbit = float(clean_orbit)
        
        print(clean_orbit, '-----', item)

In [None]:
testing = randomized_planets_squished

In [None]:
testing[1] = 0 # change what used to be the exposure column to contain labels of 0 (for now)

In [None]:
testing.head(1)

In [None]:
not_found = 0

for j in range(len(testing)): # for every light curve
    if j % 250 == 0:
        print(j)

    count = 0 # keep track of if we find it in confirmed planets list
    
    for i in range(len(all_confirmed)): # look through each star name in the list of all confirmed planets
        try:
            if all_confirmed.loc[i, 'Alternative star names'].find(testing.iloc[j, 0]) != -1:
                count += 1
                
                # make sure it has a detectable orbit
                orbit = all_confirmed.loc[i, 'Orbital period [days]']
                
                item = str(orbit)
                clean_orbit = ''
                for letter in item:

                    if letter in ['.', '+', '-', '±']:
                        break
                    elif letter in ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']:
                        clean_orbit = clean_orbit + letter

                if clean_orbit != '': # the following code will throw an error if the orbit was null
                    clean_orbit = float(clean_orbit)

                    if clean_orbit < 66: # if this is a detectable planet set label to 1
                        testing.loc[j, 1] = 1
                            
        except AttributeError: # if the alternate star names value are null
            try:
                if all_confirmed.loc[i, 'Star name'].find(testing.iloc[j, 0]) != -1:
                    count += 1
                    
                    # make sure it has a detectable orbit
                    orbit = all_confirmed.loc[i, 'Orbital period [days]']

                    item = str(orbit)
                    clean_orbit = ''
                    for letter in item:

                        if letter in ['.', '+', '-', '±']:
                            break
                        elif letter in ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']:
                            clean_orbit = clean_orbit + letter

                    if clean_orbit != '': # the following code will throw an error if the orbit was null
                        clean_orbit = float(clean_orbit)

                        if clean_orbit < 66: # if this is a detectable planet set label to 1
                            testing.loc[j, 1] = 1
                    
            except AttributeError: # if this is null too, keep going.
                continue
                
    # if the star could not be found, drop this row            
#     if count == 0:
#         testing.drop(index = j, inplace=True)

In [None]:
testing[1].value_counts()

In [None]:
labeled_planets = testing[testing[1] == 1]

In [None]:
labeled_planets.reset_index(drop=True, inplace = True)

In [None]:
# labeled_planets.to_csv('../clean_planet_data/clean_labeled_planets.csv', index=False)

In [None]:
cut = labeled_planets.iloc[:,:3199]
cut.isnull().sum(axis='rows')

In [None]:
# reset index


# clean all_confirmed planets obrital period column to be usable (numeric and no weird symbols)

# for each confirmed planet star
#     search for it in all_confirmed
#     if there is no planet with that star name with an orbit less than the detectable period
#         drop it

#     if it can't be found in all_confirmed
#         drop it and tally how many of these there are

### Set the Light Curves to the Same Time Frame:
That way there are no nulls and we can compare all the light curves from all datasets.

In [None]:
# max row length should be the number of nonmissing values in the shortest clean light curve
# make sure the feature names are consistent and usable

### Save to a New File:

In [None]:
# df_squished.to_csv('../clean_planet_data/clean_labeled_kep_c4.csv', index=False)

In [None]:
# least_null_planets.to_csv('../clean_planet_data/clean_labeled_planets.csv', index=False)

### Done!