# Cleaning and Labeling Notebook:
<hr>

***Make new files where the data is:*** <br>
> Clean, without null values. <br>
> Labeled as detectable exoplanet star or not. <br>

### Imports:

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import time
import numpy as np
import random

%matplotlib inline

### Read in the Files:

In [None]:
# These are the confirmed planet stars from the first download session
c_planets = pd.read_csv('../clean_planet_data/extracted_planets_1_again.csv')

# These are the confirmed planet stars from the second download session
c_planets_2 = pd.read_csv('../clean_planet_data/extracted_confirmed_planets_2_again.csv')

# Light curves from the c4_kep backup extract; these are labeled against the
# confirmed-planet list further down.
# NOTE(review): presumably Kepler/K2 campaign 4 targets - confirm against the download notebook.
c4_kep = pd.read_csv('../clean_planet_data/extracted_kep_c4_7700_backup.csv')

In [None]:
# Bring in a table that lists all confirmed planets with their star names and other info.
# The labeling step reads its 'Alternative star names', 'Star name' and
# 'Orbital period [days]' columns.
all_confirmed = pd.read_csv('../clean_planet_data/all_planets_list.csv')

In [None]:
# drop the last line of c4_kep because it only downloaded halfway before being stopped
# NOTE(review): 7713 is a hard-coded index label. drop() raises KeyError if that
# label is absent, and this assumes 7713 is the final row - confirm whenever the
# CSV is regenerated.
c4_kep.drop(index=7713, inplace = True)

# Start Munging:

### Randomly Select Lightcurves from the Duplicates in Confirmed Planets Set:
In the confirmed planets set, there are several light curves for each star. <br>
Randomly selecting one light curve per star will prevent confounding variables from influencing the model.

In [None]:
# Combine both sets containing confirmed planets.
# ignore_index=True gives every row a unique label: each file is read with its
# own default 0..n-1 index, so a plain concat would repeat labels, and any
# later selection by index label could then match more than one row.
combined_planets = pd.concat([c_planets, c_planets_2], ignore_index=True)

In [None]:
# Seed both generators: the selection below uses the stdlib `random` module,
# which np.random.seed() alone does not affect.
np.random.seed(112)
random.seed(112)

selected_rows = [] # one single-row DataFrame per star

# Randomly selecting each star's light curve from various time frames
for i, star in enumerate(combined_planets['star_name'].unique()):

    # Print out some feedback to show progress
    if i % 250 == 0:
        print(i)

    # select all rows whose stars have the same name
    same_star = combined_planets[combined_planets['star_name'] == star]

    # A missing star name (NaN) never equals itself, so it matches no rows
    if same_star.empty:
        continue

    # Choose by position rather than by index label, so exactly one row is
    # taken even if combined_planets carries repeated index labels.
    rand_pos = random.randrange(len(same_star))
    selected_rows.append(same_star.iloc[[rand_pos]])

# Concatenate once at the end instead of growing a DataFrame inside the loop
if selected_rows:
    randomized_planets = pd.concat(selected_rows)
else:
    randomized_planets = pd.DataFrame(columns = c_planets.columns)

# Reset the index
randomized_planets.reset_index(drop = True, inplace = True)
print('Finished!')

# Dealing with Nulls:

### Calculate Isolated Missing Values:
Fill 'one-off' missing values with mean imputation of the nearest two values.

In [None]:
for frame in [c4_kep, randomized_planets]: # both datasets are patched in place

    missing = frame.isnull() # null mask, captured once before anything is filled
    n_rows, n_cols = frame.shape

    for row in range(n_rows):

        # progress feedback
        if row % 500 == 0:
            print(row)

        # Columns 0-2 are skipped and the final column is excluded, so every
        # candidate cell has a neighbour on both sides.
        for col in range(3, n_cols - 1):

            if not missing.iloc[row, col]: # nothing to fill here
                continue

            if missing.iloc[row, col - 1] or missing.iloc[row, col + 1]:
                continue # part of a longer gap, not an isolated null

            # isolated null: replace it with the mean of its two neighbours
            frame.iloc[row, col] = np.mean([frame.iloc[row, col - 1], frame.iloc[row, col + 1]])

### Closing Gaps in Data:

In [None]:
# Shift the non-null values of every row to the left so no gaps remain.
# The result for each dataset is collected in order and unpacked at the end;
# the earlier per-iteration counter was reset inside the loop, so the second
# dataset overwrote the first and randomized_planets_squished was never set.
squished_frames = []

for df in [c4_kep, randomized_planets]: # for both datasets

    is_null = df.isnull().to_numpy() # boolean mask, one row per light curve

    kept_rows = []  # non-null values of each row, left-aligned
    kept_index = [] # row position each entry of kept_rows came from

    for i in range(df.shape[0]): # for each row

        # print out some feedback to show progress
        if i % 500 == 0:
            print(i)

        row_values = df.iloc[i].to_numpy()
        present = row_values[~is_null[i]].tolist()

        # A row with no values at all contributes nothing
        if present:
            kept_rows.append(present)
            kept_index.append(i)

    # Build the frame in one go: columns are numbered 0..k-1 and shorter rows
    # are padded with missing values on the right.
    df_squished = pd.DataFrame(kept_rows, index=kept_index)
    squished_frames.append(df_squished)

# Same order as the list iterated above
c4_kep_squished, randomized_planets_squished = squished_frames

# Assign Labels:

Set labels to 0 (no detectable confirmed exoplanets) to begin with. <br>
Label will be changed to 1 if the star name can be matched with the star names of confirmed exoplanet stars.

In [None]:
# Work on an independent copy so the squished data itself is left unchanged
# (plain assignment would only create a second name for the same DataFrame)
label_planets = randomized_planets_squished.copy()

# set label to 0: column 1 is overwritten and used as the label column from here on
label_planets[1] = 0

In [None]:
# Work on an independent copy so the squished data itself is left unchanged
# (plain assignment would only create a second name for the same DataFrame)
to_label_c4 = c4_kep_squished.copy()

# set label to 0: column 1 is overwritten and used as the label column from here on
to_label_c4[1] = 0

### Set Labels for Detectable Exoplanet Stars:
Label 1 for exoplanet positive stars. <br>
Do this for all data sets. <br>

***Search for each star name in list of all confirmed exoplanets:*** <br>

In [None]:
# Set detectable period based on length of light curves (66 days)
search_period = 66


def _whole_days(orbit):
    """Return the whole-day part of an orbital-period entry, or None.

    The entry is converted to text and read character by character: reading
    stops at the first '.', '+', '-' or '±', digits seen before that point are
    kept, and any other character is skipped. None is returned when no digit
    was found (for example when the entry is missing).
    """
    digits = []
    for letter in str(orbit):
        if letter in '.+-±':
            break
        if letter in '0123456789':
            digits.append(letter)
    return float(''.join(digits)) if digits else None


# Scan the confirmed-planet table once and keep the searchable name text of
# every planet whose orbit is short enough to be detectable. The alternative
# names are searched when present; the plain star name is only used when the
# alternative names are missing; rows with neither are skipped.
detectable_hosts = []
for alt_names, star_name, orbit in zip(all_confirmed['Alternative star names'],
                                       all_confirmed['Star name'],
                                       all_confirmed['Orbital period [days]']):
    if isinstance(alt_names, str):
        searchable = alt_names
    elif isinstance(star_name, str):
        searchable = star_name
    else:
        continue

    days = _whole_days(orbit)
    if days is not None and days < search_period:
        detectable_hosts.append(searchable)

for df in [to_label_c4, label_planets]:

    # Read and write by position so a non-default index cannot misplace labels
    label_col = df.columns.get_loc(1)

    for j, name in enumerate(df.iloc[:, 0]): # for every light curve
        if j % 500 == 0:
            print(j)

        # A light curve without a usable star name cannot be matched
        if not isinstance(name, str):
            continue

        # NOTE(review): this is a substring match, so a name such as 'X-1'
        # also matches an entry containing 'X-10' - confirm this is intended.
        if any(name in host for host in detectable_hosts):
            df.iloc[j, label_col] = 1 # detectable planet: set label to 1

### Check Labels:

In [None]:
# Count how many c4_kep light curves carry each label value (column 1)
to_label_c4[1].value_counts()

In [None]:
# Count how many confirmed-planet light curves carry each label value (column 1)
label_planets[1].value_counts()

### Remove Undetectable Planet Stars from Exoplanet Light Curve Set:

In [None]:
# Save only the positive labels to a new df
labeled_planets = label_planets[label_planets[1] == 1].loc[:,:3381] # only keep the clean values ending at column 3381

# Reset the index
labeled_planets.reset_index(drop=True, inplace = True)

# Set Column Names

### Prepare List of Column Names:

In [None]:
# Start from the c4_kep header, as a plain Python list
col_to_add = c4_kep.columns.tolist()

# 'exposure' is not useful information, so take it out of the names
col_to_add.remove('exposure')

# slot the column name 'label' in at position 1
col_to_add[1:1] = ['label']

### Rename Columns:

In [None]:
# Remove the extra column names: keep exactly as many names as labeled_planets
# has columns, so the assignment cannot fail on a length mismatch if the frame
# is narrower than the 3382 columns it was trimmed to.
lst = col_to_add[:labeled_planets.shape[1]]

# Set column names to labeled_planets
labeled_planets.columns = lst

# Set column names to to_label_c4 (same rule: one name per existing column)
to_label_c4.columns = col_to_add[:to_label_c4.shape[1]]

# Save to a New File:

In [None]:
# Write the positively labeled confirmed-planet light curves to CSV, without the index column
labeled_planets.to_csv('../clean_planet_data/clean_labeled_planets.csv', index=False)

In [None]:
# Write the labeled c4_kep light curves to CSV, without the index column
to_label_c4.to_csv('../clean_planet_data/clean_labeled_c4_kep.csv', index=False)

# Done!

<hr>