### Goals in this Notebook:
Make new files where the data is: 
> Clean, without null values. <br>
> Labeled as detectable planet around star (1) or not (0). <br>
> Set to the same time frame. <br>

### Imports:

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import time
import numpy as np
import random

%matplotlib inline

### Read in the Files:

In [2]:
# These are the confirmed planet stars from the first download session
c_planets = pd.read_csv('../clean_planet_data/extracted_planets_1_again.csv')

# These are the confirmed planet stars from the second download session
c_planets_2 = pd.read_csv('../clean_planet_data/extracted_confirmed_planets_2_again.csv')

c4_kep = pd.read_csv('../clean_planet_data/extracted_kep_c4_7700_backup.csv')

In [3]:
# drop the last line of c4_kep because it only downloaded halfway before being stopped
c4_kep.drop(index=7713, inplace = True)

# Start Munging:

### Randomly Select Lightcurves from the Duplicates in Confirmed Planets Set:
In the confirmed planets set, there are several light curves for each star. <br>
Randomly selecting the light curve from these will prevent counfounding variables from influencing the model. 

In [17]:
# combine both sets containing confirmed planets
combined_planets = pd.concat([c_planets, c_planets_2])

In [33]:
randomized_planets = pd.DataFrame(columns = c_planets.columns)

i = 0

# Randomly selecting each star's light curves from various time frames
for star in combined_planets['star_name'].unique():
    
    # Print out some feedback to show progress
    if i % 250 == 0:
        print(i)    
    i += 1

    # Randomly select light curve and add it to randomized_planets
    same_star = combined_planets[combined_planets['star_name'] == star] # select all rows whose stars have the same name
    rand_select = random.choice(same_star.index) # randomly select one of the index numbers
    randomized_planets = pd.concat([randomized_planets, same_star[same_star.index == rand_select]]) # add it to the new df

# Reset the index
randomized_planets.reset_index(drop = True, inplace = True)    
print('Finished!')

250
500
750
1000
1250


# Dealing with Nulls:

### Calculate Isolated Missing Values:
Fill 'one-off' missing values with mean imputation of the nearest two values.

In [42]:
for df in [c4_kep, randomized_planets]: # for each dataset
    
    is_null = df.isnull() # make a df of which values are null or not

    for i in range(df.shape[0]): # for each row

        # Print out some feedback to show progress
        if i % 500 == 0:
            print(i)

        for j in range(df.shape[1]-1): # for each column
            if j > 2: # skip the first three columns

                if is_null.iloc[i, j] == True: # if the cell is null
                    if not ((is_null.iloc[i, j-1] == True) | (is_null.iloc[i, j+1] == True)): # and if the adjacent cells are non-null
                        df.iloc[i, j] = np.mean([df.iloc[i, j-1], df.iloc[i, j+1]]) # assign the cell to be the mean of the adjacent cells

0
250
500
750
1000
1250
1500
1750
2000
2250
2500
2750
3000
3250
3500
3750
4000
4250
4500
4750
5000
5250
5500
5750
6000
6250
6500
6750
7000
7250
7500
0
250
500
750
1000
1250


### Closing Gaps in Data:

In [45]:
c4_kep.head()

Unnamed: 0,star_name,exposure,c_flux_0,c_flux_1,c_flux_2,c_flux_3,c_flux_4,c_flux_5,c_flux_6,c_flux_7,...,c_flux_4387,c_flux_4388,c_flux_4389,c_flux_4390,c_flux_4391,c_flux_4392,c_flux_4393,c_flux_4394,c_flux_4395,c_flux_4396
0,KIC 1025494,82.716241,,263338.2,263343.7,263349.3,263288.3,263403.4,263315.3,263307.6,...,263326.2,263309.6,263345.0,263305.5,263307.6,263341.2,263333.6,263333.2,263317.6,263352.6
1,KIC 1025578,82.716237,,12923.38,12920.32,12917.26,12913.29,12921.88,12922.11,12911.57,...,12912.79,12917.49,12924.33,12921.21,12911.19,12916.99,12917.56,12932.14,12931.86,12908.6
2,KIC 1025986,82.716227,,1276864.0,1277048.0,1277233.0,1277410.0,1277653.0,1277849.0,1278068.0,...,1272619.0,1273011.0,1273417.0,1273826.0,1274113.0,1274493.0,1274748.0,1275238.0,1275759.0,1276052.0
3,KIC 1026032,82.716228,,17436.18,17432.69,17429.21,17433.79,17430.04,17431.59,17427.26,...,17392.45,17395.75,17392.41,17405.89,17397.01,17388.93,17400.34,17393.58,17389.54,17385.59
4,KIC 1026133,82.716226,,80894.27,80898.23,80902.19,80913.89,80886.92,80895.93,80875.95,...,80906.45,80924.13,80899.6,80911.61,80901.57,80900.39,80929.84,80914.38,80914.08,80908.27


In [153]:
randomized_planets

Unnamed: 0,star_name,exposure,c_flux_0,c_flux_1,c_flux_2,c_flux_3,c_flux_4,c_flux_5,c_flux_6,c_flux_7,...,c_flux_3991,c_flux_3992,c_flux_3993,c_flux_3994,c_flux_3995,c_flux_3996,c_flux_3997,c_flux_3998,c_flux_3999,c_flux_4000
0,KIC 10000941,89.428752,,50976.839844,50979.195312,50969.281250,50985.082031,50980.406250,50999.250000,50997.367188,...,51250.804688,51250.644531,51270.457031,51251.636719,51274.710938,51261.406250,51265.597656,51283.000000,51282.066406,51284.421875
1,KIC 10001368,83.170091,,34929.890625,34944.937500,34951.902344,34945.750000,34939.343750,34950.273438,34953.218750,...,34932.664062,34939.875000,34924.628906,34944.035156,34932.160156,34937.757812,34933.089844,34940.773438,34938.566406,34930.382812
2,KIC 10001893,89.488232,,6409.023438,6402.603516,6414.855469,6411.680176,6411.096680,6406.604004,6411.358398,...,6412.737305,6419.609863,6385.927246,6413.119629,6401.883789,6416.965332,6396.288574,6423.123047,6404.283203,6415.197754
3,KIC 10002866,82.734516,,13770.859375,13767.226562,13774.398438,13777.831055,13790.148438,13795.668945,13798.778320,...,13799.700195,13797.586914,13805.923828,13791.137695,13797.186523,13794.869141,13790.818359,13802.535156,13794.096680,13803.714844
4,KIC 10004519,89.428692,,9705.454102,9704.304688,9704.225586,9721.155273,9710.652344,9711.234375,9708.650391,...,9704.010742,9705.146484,9709.726562,9711.235352,9710.955078,9710.334961,9713.541992,9703.168945,9705.213867,9708.651367
5,KIC 10004738,83.170126,,28549.492188,28565.992188,28562.771484,28560.925781,28568.968750,28565.000000,28563.617188,...,28554.154297,28560.261719,28552.607422,28546.701172,28543.712891,28552.558594,28552.429688,28552.449219,28554.312500,28560.101562
6,KIC 10005788,82.299183,,9880.373047,9885.706055,9874.191406,9875.858398,9883.832031,9886.805664,9888.148438,...,9804.233398,9811.117188,9798.383789,9806.215820,9804.532227,9806.823242,9798.978516,9800.249023,9803.126953,9807.916992
7,KIC 10006581,82.715891,,17660.990234,17655.852539,17650.714844,17655.941406,17657.910156,17661.513672,17658.884766,...,17652.837891,17648.792969,17657.261719,17649.488281,17657.320312,17661.773438,17666.207031,17659.242188,17659.919922,17658.673828
8,KIC 10010440,82.299137,,15685.416992,15691.946289,15710.838867,15696.607422,15681.508789,15678.375977,15686.210938,...,15688.317383,15680.695312,15696.869141,15696.500000,15696.250000,15700.733398,15688.188477,15688.628906,15691.650391,15686.998047
9,KIC 10018233,89.698143,,11776.289062,11770.666016,11779.017578,11782.014648,11782.131836,11779.104492,11786.482422,...,11779.743164,11771.726562,11780.665039,11774.854492,11776.174805,11778.401367,11773.239258,11777.428711,11766.829102,11774.329102


In [148]:
randomized_planets.shape

(1265, 4003)

In [46]:
for df in [randomized_planets]: #c4_kep # for both datasets
#     df_count = 1 
    
    # Shifting values to fill nulls
    is_null = df.isnull()
    df_squished = pd.DataFrame()

    for i in range(df.shape[0]): # for each row
        
        # print out some feedback to show progress
        if i % 100 == 0:
            print(i)

        k = 0 # reset the df_squished column index to 0 for each new row
        
        for j in range(df.shape[1]): # for each column
            
            if is_null.iloc[i, j] == False: # if this cell is not null
                df_squished.loc[i, k] = df.iloc[i, j] # add the cell to df_squished
                k += 1
    
#     # save df_squished to different variables for each dataset
#     if df_count = 1: # if this is the c4_kep set
#         c4_kep_squished = df_squished
#     else: # for the randomized_planets set
randomized_planets_squished = df_squished
#     df_count += 1

0
500
1000


In [156]:
# Do this for c4_kep and confirmed planets

# fix this: the column names will no longer be relevent
check_it_out = randomized_planets_squished.head(1267).iloc[:,:3199]
check_it_out

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3189,3190,3191,3192,3193,3194,3195,3196,3197,3198
0,KIC 10000941,1.0,50976.839844,50979.195312,50969.281250,50985.082031,50980.406250,50999.250000,50997.367188,51002.109375,...,51458.054688,51502.503906,51488.636719,51497.789062,51519.984375,51521.296875,51532.082031,51535.054688,51552.472656,51551.257812
1,KIC 10001368,1.0,34929.890625,34944.937500,34951.902344,34945.750000,34939.343750,34950.273438,34953.218750,34941.695312,...,34929.984375,34940.746094,34937.269531,34937.527344,34939.363281,34929.093750,34945.105469,34955.000000,34930.570312,34935.394531
3,KIC 10002866,1.0,13770.859375,13767.226562,13774.398438,13777.831055,13790.148438,13795.668945,13798.778320,13800.320312,...,13813.326172,13809.694336,13818.818359,13810.759766,13818.966797,13821.005859,13805.796875,13812.212891,13812.047852,13798.031250
4,KIC 10004519,1.0,9705.454102,9704.304688,9704.225586,9721.155273,9710.652344,9711.234375,9708.650391,9713.518555,...,9708.580078,9705.692383,9709.043945,9709.913086,9714.366211,9703.272461,9700.703125,9709.462891,9708.046875,9709.661133
5,KIC 10004738,1.0,28549.492188,28565.992188,28562.771484,28560.925781,28568.968750,28565.000000,28563.617188,28559.498047,...,28540.380859,28564.914062,28566.078125,28556.917969,28554.025391,28554.765625,28555.734375,28565.980469,28557.003906,28557.652344
6,KIC 10005788,1.0,9880.373047,9885.706055,9874.191406,9875.858398,9883.832031,9886.805664,9888.148438,9880.096680,...,9871.015625,9872.560547,9877.399414,9876.719727,9872.230469,9870.846680,9874.615234,9869.755859,9873.184570,9867.596680
7,KIC 10006581,1.0,17660.990234,17655.852539,17650.714844,17655.941406,17657.910156,17661.513672,17658.884766,17647.830078,...,17655.208984,17666.802734,17663.101562,17653.656250,17652.203125,17661.312500,17649.076172,17661.404297,17653.066406,17661.300781
8,KIC 10010440,1.0,15685.416992,15691.946289,15710.838867,15696.607422,15681.508789,15678.375977,15686.210938,15689.397461,...,15685.273438,15700.857422,15689.883789,15700.154297,15690.612305,15693.938477,15700.028320,15689.652344,15695.083984,15686.435547
9,KIC 10018233,1.0,11776.289062,11770.666016,11779.017578,11782.014648,11782.131836,11779.104492,11786.482422,11772.039062,...,11770.364258,11775.452148,11769.339844,11770.879883,11768.898438,11776.395508,11768.343750,11770.771484,11782.024414,11778.720703
10,KIC 10019065,1.0,14441.860352,14450.415039,14451.867188,14446.297852,14440.292969,14442.291016,14442.030273,14450.122070,...,14461.983398,14458.476562,14461.308594,14456.413086,14462.181641,14450.108398,14455.656250,14451.854492,14449.400391,14459.626953


# Assign Labels

### Assign Labels to Stars with Planets:

In [49]:
# Bring in a table that lists all confirmed planets with their star names and other info
all_confirmed = pd.read_csv('../clean_planet_data/all_planets_list.csv')

In [None]:
# Assign labels to c4_kep
not_found = 0

for j in range(len(c4_kep)): # for every light curve
    if j % 250 == 0:
        print(j)

    count = 0
    for i in range(len(all_confirmed)): # look through each star name in the list of all confirmed planets
        try:
            if all_confirmed.loc[i, 'Alternative star names'].find(c4_kep.iloc[j, 0]) != -1:
                count += 1
                print(c4_kep.iloc[j, 0], ' found @ index: ', j, 'orbital period: ', all_confirmed.loc[i, 'Orbital period [days]'])
                c4_kep.loc[j, '1'] = 1
    
        except AttributeError: # if the alternate star names value are null
            try:
                if all_confirmed.loc[i, 'Star name'].find(c4_kep.iloc[j, 0]) != -1:
                    count += 1
                    print(c4_kep.iloc[j, 0], ' found @ index: ', j, 'on 2nd level of loop', 'orbital period: ', all_confirmed.loc[i, 'Orbital period [days]'])
                    c4_kep.loc[j, '1'] = 1
                    
            except AttributeError: # if this is null too, keep going. There are few of these cases in the set
                continue
                
    if count == 0:
        not_found += 1

In [None]:
# change label on confirmed stars with no planets under the timeframe we're looking at
#     add a 'detectable' column?

# drop stars from confirmed planets that cannot be found? how many are there?, can I get this data somewhere else?

### Make Detectable Planets Label for Confirmed Planets Set:
Label should only be positive if the planet has a detectable orbitable period.

In [67]:
for item in all_confirmed['Orbital period [days]']:
    
    item = str(item)
    clean_orbit = ''
    for letter in item:
        
        if letter in ['.', '+', '-', '±']:
            break
        elif letter in ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']:
            clean_orbit = clean_orbit + letter
            
    if clean_orbit != '':
        clean_orbit = float(clean_orbit)
        
        print(clean_orbit, '-----', item)

0.0 ----- 0.1769±0.0000
4.0 ----- 4.1945±0.0000
6.0 ----- 6.3560±0.0001
19.0 ----- 19.2242±0.0001
39.0 ----- 39.0311±0.0002
1.0 ----- 1.5929±0.0000
5.0 ----- 5.235+0.003−0.006
18.0 ----- 18.4279±0.0000
3.0 ----- 3.37
502.0 ----- 502±2
3500.0 ----- 3500
3.0 ----- 3.3518±0.0000
5.0 ----- 5.7532±0.0000
2.0 ----- 2.7058±0.0000
1.0 ----- 1.5804±0.0000
9.0 ----- 9.0601±0.0002
18.0 ----- 18.8702±0.0000
46.0 ----- 46.9023±0.0002
437.0 ----- 437.1±0.3
7053.0 ----- 7053+1624−2324
4.0 ----- 4.4786±0.0006
14.0 ----- 14.48
0.0 ----- 0.6756±0.0000
3.0 ----- 3.0966±0.0000
3380.0 ----- 3380+41−40
8.0 ----- 8.7962±0.0000
7.0 ----- 7.9902±0.0001
237.0 ----- 237.71±0.08
6.0 ----- 6.80
8.0 ----- 8.70
11.0 ----- 11.92
16.0 ----- 16.09
25.0 ----- 25.5
4.0 ----- 4.4542±0.0000
1135.0 ----- 1135
5.0 ----- 5.1134±0.0006
11.0 ----- 11.759±0.006
4.0 ----- 4.4216±0.0000
6.0 ----- 6.0404±0.0000
4.0 ----- 4.1386±0.0000
6.0 ----- 6.6264±0.0000
3.0 ----- 3.7012±0.0000
40.0 ----- 40.0±0.2
6.0 ----- 6.5815±0.0000
2.0 --

19.0 ----- 19.9637±0.0001
2.0 ----- 2.22
34.0 ----- 34.3512±0.0002
2.0 ----- 2.4958±0.0000
5.0 ----- 5.3391±0.0000
428.0 ----- 428.5±1.2
8.0 ----- 8.3609±0.0000
17.0 ----- 17.99
13.0 ----- 13.3916±0.0001
5.0 ----- 5.6290±0.0000
1840.0 ----- 1840
4.0 ----- 4.2286±0.0000
7.0 ----- 7.4666±0.0001
16.0 ----- 16.2595±0.0002
11.0 ----- 11.1318±0.0001
81.0 ----- 81.3151±0.0006
2.0 ----- 2.0806±0.0001
28.0 ----- 28.680±0.009
28.0 ----- 28.8624±0.0001
13.0 ----- 13.6280±0.0001
5.0 ----- 5.9074±0.0000
18.0 ----- 18.18
3.0 ----- 3.7043±0.0000
8.0 ----- 8.0413±0.0000
5.0 ----- 5.1885±0.0000
11.0 ----- 11.7761±0.0000
3.0 ----- 3.7018±0.0000
0.0 ----- 0.6313±0.0000
8.0 ----- 8.3057±0.0001
12.0 ----- 12.5126±0.0001
17.0 ----- 17.9132±0.0003
25.0 ----- 25.2168±0.0007
0.0 ----- 0.8684±0.0000
8.0 ----- 8.0277±0.0000
1.0 ----- 1.7986±0.0000
7.0 ----- 7.4111±0.0000
3.0 ----- 3.3880±0.0000
5.0 ----- 5.8392±0.0000
1.0 ----- 1.6205±0.0000
10.0 ----- 10.6816±0.0001
1684.0 ----- 1684±61
125.0 ----- 125.1±1.1
3.

10.0 ----- 10.0653±0.0000
2.0 ----- 2.0523±0.0000
1.0 ----- 1.512
3.0 ----- 3.5732±0.0000
7.0 ----- 7.6263±0.0000
15.0 ----- 15.9956±0.0001
34.0 ----- 34.2115±0.0003
394.0 ----- 394.3+1.4−1.2
8.0 ----- 8.3985±0.0001
2.0 ----- 2.4842±0.0000
1173.0 ----- 1173±16
8.0 ----- 8.6318±0.0015
25.0 ----- 25.63±0.03
3.0 ----- 3.5465±0.0000
5.0 ----- 5.7008±0.0001
5.0 ----- 5.8706±0.0000
3.0 ----- 3.3294±0.0000
9.0 ----- 9.9775±0.0010
12.0 ----- 12.82
6.0 ----- 6.89
35.0 ----- 35.3
2.0 ----- 2.43
4.0 ----- 4.62
12.0 ----- 12.3334±0.0001
8.0 ----- 8.1452±0.0001
18.0 ----- 18.9984±0.0002
4.0 ----- 4.2444±0.0001
5.0 ----- 5.3426±0.0000
5.0 ----- 5.5765±0.0000
60.0 ----- 60.8662±0.0005
456.0 ----- 456
16.0 ----- 16.7365±0.0000
3.0 ----- 3.69
1.0 ----- 1.8271±0.0000
1129.0 ----- 1129+6−8
2.0 ----- 2.8062±0.0000
8.0 ----- 8.2821±0.0006
0.0 ----- 0.5702±0.0000
84.0 ----- 84.7039±0.0019
2.0 ----- 2.9403±0.0000
6.0 ----- 6.3890±0.0000
14.0 ----- 14.6271±0.0001
35.0 ----- 35.1187±0.0003
11.0 ----- 11.1283±0

4.0 ----- 4.8664±0.0000
10.0 ----- 10.9403±0.0000
2.0 ----- 2.2783±0.0000
66.0 ----- 66.3734±0.0015
8.0 ----- 8.8849±0.0000
5.0 ----- 5.2085±0.0004
60.0 ----- 60.9283±0.0001
56.0 ----- 56.4754±0.0002
92.0 ----- 92.8761±0.0008
13.0 ----- 13.9307±0.0001
94.0 ----- 94.44±0.05
210.0 ----- 210.99±0.08
1070.0 ----- 1070±7
5000.0 ----- 5000+560−2000
14.0 ----- 14.5665+0.0016−0.0020
35.0 ----- 35.7408±0.0002
12.0 ----- 12.3097±0.0001
54.0 ----- 54.3996±0.0005
3.0 ----- 3.92
1.0 ----- 1.9316±0.0000
4.0 ----- 4.9713±0.0000
29.0 ----- 29.6092±0.0003
4.0 ----- 4.7228±0.0000
2.0 ----- 2.3617±0.0000
2.0 ----- 2.7556±0.0000
20.0 ----- 20.3065±0.0000
876.0 ----- 876
434.0 ----- 434±3
28.0 ----- 28.2274±0.0001
24.0 ----- 24.4
17.0 ----- 17.1370±0.0001
536.0 ----- 536
0.0 ----- 0.8818±0.0000
0.0 ----- 0.3811±0.0000
4.0 ----- 4.5101±0.0000
10.0 ----- 10.6166+0.0016−0.0018
10.0 ----- 10.34
1798.0 ----- 1798
1.0 ----- 1.3485±0.0000
3.0 ----- 3.5857±0.0000
33.0 ----- 33.4164±0.0004
12.0 ----- 12.6106±0.0002

45.0 ----- 45.2943±0.0000
102.0 ----- 102.0±1.0
112.0 ----- 112.305+0.002−0.003
891.0 ----- 891
7.0 ----- 7.1397±0.0000
0.0 ----- 0.7621±0.0000
6.0 ----- 6.40
48.0 ----- 48.6304±0.0001
29.0 ----- 29.3932±0.0001
14.0 ----- 14.4359±0.0001
16.0 ----- 16.2237±0.0001
19.0 ----- 19.5975±0.0001
3.0 ----- 3.5839±0.0000
18.0 ----- 18.2490±0.0010
3.0 ----- 3.2957±0.0000
9.0 ----- 9.6535±0.0000
5.0 ----- 5.3159±0.0000
1.0 ----- 1.4991±0.0000
5.0 ----- 5.1858±0.0000
9.0 ----- 9.88
3.0 ----- 3.7222±0.0000
8.0 ----- 8.0880±0.0000
2082.0 ----- 2082
31.0 ----- 31.0034±0.0002
39.0 ----- 39.721±0.006
6.0 ----- 6.9424±0.0001
11.0 ----- 11.4761±0.0001
25.0 ----- 25.2629±0.0006
4.0 ----- 4.3177±0.0000
4093.0 ----- 4093+750−520
2.0 ----- 2.8506±0.0000
1.0 ----- 1.338
5.0 ----- 5.1722±0.0002
5.0 ----- 5.91
8.0 ----- 8.98
672.0 ----- 672
19.0 ----- 19.9158±0.0001
516.0 ----- 516±3
10.0 ----- 10.5263±0.0000
14.0 ----- 14.0349±0.0000
16.0 ----- 16.1047±0.0001
3.0 ----- 3.03
746.0 ----- 746±14
8.0 ----- 8.9810±0

7.0 ----- 7.0257±0.0000
5.0 ----- 5.73
11.0 ----- 11.61
3.0 ----- 3.0571±0.0000
55.0 ----- 55.8227±0.0004
8.0 ----- 8.9909±0.0000
2.0 ----- 2.4037±0.0000
15.0 ----- 15.9315±0.0001
11.0 ----- 11.85
33.0 ----- 33.8
9.0 ----- 9.9407±0.0000
883.0 ----- 883
1.0 ----- 1.8556±0.0000
384.0 ----- 384
13.0 ----- 13.72
242.0 ----- 242
3.0 ----- 3.8959±0.0000
2.0 ----- 2.9419±0.0000
5.0 ----- 5.0172±0.0000
425.0 ----- 425.478±0.011
3267.0 ----- 3267±33
3.0 ----- 3.7557±0.0000
2.0 ----- 2.1395±0.0000
4.0 ----- 4.8071±0.0000
47.0 ----- 47.1612±0.0004
647.0 ----- 647±17
10.0 ----- 10.062±0.002
3.0 ----- 3.2687±0.0000
2.0 ----- 2.5348±0.0000
342.0 ----- 342±6
24.0 ----- 24.9932±0.0001
4.0 ----- 4.8996±0.0000
3.0 ----- 3.9512±0.0000
2.0 ----- 2.8922±0.0000
5.0 ----- 5.9927±0.0000
5.0 ----- 5.1012±0.0000
17.0 ----- 17.4449±0.0000
26.0 ----- 26.672±0.004
4.0 ----- 4.23
6.0 ----- 6.9969±0.0000
151.0 ----- 151.8639±0.0011
4.0 ----- 4.62
241.0 ----- 241
1283.0 ----- 1283
3849.0 ----- 3849
1.0 ----- 1.4325±0

16.0 ----- 16.5408±0.0000
12.0 ----- 12.2830±0.0000
17.0 ----- 17.2512±0.0000
3.0 ----- 3.5439±0.0000
46.0 ----- 46.8279±0.0002
1183.0 ----- 1183
13.0 ----- 13.2218±0.0000
299.0 ----- 299.4±0.3
41.0 ----- 41.4
3.0 ----- 3.20 ·105
56.0 ----- 56.1887±0.0003
3.0 ----- 3.1890±0.0000
3.0 ----- 3.5256±0.0000
4.0 ----- 4.2568±0.0000
1159.0 ----- 1159
2.0 ----- 2.9490±0.0000
1544.0 ----- 1544
32.0 ----- 32.6256±0.0001
91.0 ----- 91.7732±0.0016
2.0 ----- 2.90
1057.0 ----- 1057±5
214.0 ----- 214.7±0.5
117.0 ----- 117.87±0.18
49.0 ----- 49.17±0.04
677.0 ----- 677±8
5700.0 ----- 5700±1500
277.0 ----- 277
16.0 ----- 16.24
94.0 ----- 94.2888±0.0006
6.0 ----- 6.6718±0.0002
16.0 ----- 16.1972±0.0008
70.0 ----- 70.6979±0.0003
8.0 ----- 8.14
32.0 ----- 32.0
432.0 ----- 432
12.0 ----- 12.7580±0.0001
5.0 ----- 5.4703±0.0000
25.0 ----- 25.0985±0.0001
38.0 ----- 38.8718±0.0002
74.0 ----- 74.9201±0.0001
13.0 ----- 13.84
16.0 ----- 16.24
4.0 ----- 4.6059±0.0000
10.0 ----- 10.6761±0.0001
386.0 ----- 386
5.0 --

4.0 ----- 4.2245±0.0000
44.0 ----- 44.5522±0.0008
4.0 ----- 4.4097±0.0000
5.0 ----- 5.3512±0.0006
162.0 ----- 162.0
1156.0 ----- 1156
0.0 ----- 0.7699±0.0000
18.0 ----- 18.8275±0.0001
47.0 ----- 47.0569±0.0002
16.0 ----- 16.3389±0.0000
34.0 ----- 34.4359±0.0001
2.0 ----- 2.9896±0.0000
11.0 ----- 11.2007±0.0001
14.0 ----- 14.0095±0.0001
2.0 ----- 2.49
4.0 ----- 4.69
14.0 ----- 14.07
95.0 ----- 95.4
11.0 ----- 11.9540±0.0001
65.0 ----- 65.6488±0.0007
2.0 ----- 2.8606±0.0003
12.0 ----- 12.7597±0.0000
5.0 ----- 5.4331±0.0000
25.0 ----- 25.7520±0.0001
51.0 ----- 51.1658±0.0006
62.0 ----- 62.8692±0.0010
5.0 ----- 5.8360±0.0000
1.0 ----- 1.5947±0.0000
160.0 ----- 160.8847±0.0001
2.0 ----- 2.7141±0.0000
6.0 ----- 6.5100±0.0000
22.0 ----- 22.2081±0.0000
1.0 ----- 1.4347±0.0000
953.0 ----- 953±9
269.0 ----- 269.3±2.0
912.0 ----- 912
18.0 ----- 18.20
136.0 ----- 136.8±0.2
0.0 ----- 0.9685±0.0000
4.0 ----- 4.2437±0.0000
14.0 ----- 14.7888±0.0001
9.0 ----- 9.8482±0.0001
19.0 ----- 19.7217±0.0002
7.

In [108]:
testing = randomized_planets_squished

In [109]:
testing[1] = 0 # change what used to be the exposure column to contain labels of 0 (for now)

In [110]:
testing.head(1)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3905,3906,3907,3908,3909,3910,3911,3912,3913,3914
0,KIC 10000941,0,50976.839844,50979.195312,50969.28125,50985.082031,50980.40625,50999.25,50997.367188,51002.109375,...,,,,,,,,,,


In [111]:
not_found = 0

for j in range(len(testing)): # for every light curve
    if j % 250 == 0:
        print(j)

    count = 0 # keep track of if we find it in confirmed planets list
    
    for i in range(len(all_confirmed)): # look through each star name in the list of all confirmed planets
        try:
            if all_confirmed.loc[i, 'Alternative star names'].find(testing.iloc[j, 0]) != -1:
                count += 1
                
                # make sure it has a detectable orbit
                orbit = all_confirmed.loc[i, 'Orbital period [days]']
                
                item = str(orbit)
                clean_orbit = ''
                for letter in item:

                    if letter in ['.', '+', '-', '±']:
                        break
                    elif letter in ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']:
                        clean_orbit = clean_orbit + letter

                if clean_orbit != '': # the following code will throw an error if the orbit was null
                    clean_orbit = float(clean_orbit)

                    if clean_orbit < 66: # if this is a detectable planet set label to 1
                        testing.loc[j, 1] = 1
                            
        except AttributeError: # if the alternate star names value are null
            try:
                if all_confirmed.loc[i, 'Star name'].find(testing.iloc[j, 0]) != -1:
                    count += 1
                    
                    # make sure it has a detectable orbit
                    orbit = all_confirmed.loc[i, 'Orbital period [days]']

                    item = str(orbit)
                    clean_orbit = ''
                    for letter in item:

                        if letter in ['.', '+', '-', '±']:
                            break
                        elif letter in ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']:
                            clean_orbit = clean_orbit + letter

                    if clean_orbit != '': # the following code will throw an error if the orbit was null
                        clean_orbit = float(clean_orbit)

                        if clean_orbit < 66: # if this is a detectable planet set label to 1
                            testing.loc[j, 1] = 1
                    
            except AttributeError: # if this is null too, keep going.
                continue
                
    # if the star could not be found, drop this row            
#     if count == 0:
#         testing.drop(index = j, inplace=True)

0
250
500
750
1000


In [112]:
testing[1].value_counts()

1.0    1143
0.0     121
Name: 1, dtype: int64

In [116]:
labeled_planets = testing[testing[1] == 1]

In [123]:
labeled_planets.reset_index(drop=True, inplace = True)

In [125]:
# labeled_planets.to_csv('../clean_planet_data/clean_labeled_planets.csv', index=False)

In [132]:
cut = labeled_planets.iloc[:,:3199]
cut.isnull().sum(axis='rows')

0       14
1        0
2       14
3       14
4       14
5       14
6       14
7       14
8       14
9       14
10      14
11      14
12      14
13      14
14      14
15      14
16      14
17      14
18      14
19      14
20      14
21      14
22      14
23      14
24      14
25      14
26      14
27      14
28      14
29      14
        ..
3169    14
3170    14
3171    14
3172    14
3173    14
3174    14
3175    14
3176    14
3177    14
3178    14
3179    14
3180    14
3181    14
3182    14
3183    14
3184    14
3185    14
3186    14
3187    14
3188    14
3189    14
3190    14
3191    14
3192    14
3193    14
3194    14
3195    14
3196    14
3197    14
3198    14
Length: 3199, dtype: int64

In [None]:
# reset index


# clean all_confirmed planets obrital period column to be usable (numeric and no weird symbols)

# for each confirmed planet star
#     search for it in all_confirmed
#     if there is no planet with that star name with an orbit less than the detectable period
#         drop it

#     if it can't be found in all_confirmed
#         drop it and tally how many of these there are

### Set the Light Curves to the Same Time Frame:
That way there are no nulls and we can compare all the light curves from all datasets.

In [None]:
# max row length should be the number of nonmissing values in the shortest clean light curve
# make sure the feature names are consistent and usable

### Save to a New File:

In [None]:
# df_squished.to_csv('../clean_planet_data/clean_labeled_kep_c4.csv', index=False)

In [None]:
# least_null_planets.to_csv('../clean_planet_data/clean_labeled_planets.csv', index=False)

### Done!