In [126]:
import pandas as pd
import os
from astropy.io import fits
import random
import shutil
import numpy as np
from shutil import copyfile

#### Data Background

In [2]:
# There are seven files containing data appropriate for initial review,
# descriptions of which can be found in the comment sections below.
#
# Tables are each provided from http://efigi.org 
#
# Table history is as follows:
# Mar 29, 2011: initial 1.6 release.
# May 16, 2011: v1.6.1 fixes coordinate swaps in EFIGI_coord_redshift table.
# Jul 28, 2011: v1.6.2 fixes corrupted B/T values for some early types.
#
# This archive contains version 1.6.2 of the EFIGI reference dataset. The EFIGI
# reference dataset contains SDSS images and visual morphology for 4458
# galaxies from the RC3 catalogue. See Baillard et al. 2011 (A&A 532, A74) and
# http://efigi.org for details.
#
# Any use of these data must reference :
# Baillard, B., Bertin, E., de Lapparent, V., et al. 2011, A&A, 532, A74
#
# Comments on table content should be sent to the EFIGI user forum: 
# http://efigi.org/forum
# or to Valerie de Lapparent (lapparen at iap.fr) and Emmanuel Bertin (bertin at iap.fr)
#
# The EFIGI dataset is distributed in 7 separate compressed archives (gzipped
# tar format):
# - efigi_tables-1.6.2.tgz: 6 ASCII tables, including morphological information
# - efigi_ima_u-1.6.tgz: 4458 galaxy images in the SDSS u-band (FITS format)
# - efigi_ima_g-1.6.tgz: 4458 galaxy images in the SDSS g-band (FITS format)
# - efigi_ima_r-1.6.tgz: 4458 galaxy images in the SDSS r-band (FITS format)
# - efigi_ima_i-1.6.tgz: 4458 galaxy images in the SDSS i-band (FITS format)
# - efigi_ima_z-1.6.tgz: 4458 galaxy images in the SDSS z-band (FITS format)
# - efigi_psf-1.6.tgz: 4458x5 PSF images

In [3]:
# EFIGI_attributes.txt version 1.6.2 :
#
# This file contains the PGC name, the EFIGI attributes and 
# the corresponding lower and upper bounds of confidence intervals
# for the 4458 galaxies of the EFIGI catalogue
#
# Description  
#--------------
#
#    PGC_name              PGC name		
#    T                     EFIGI morphological type 
#    T_inf         	   Lower confidence limit in T
#    T_sup                 Upper confidence limit in T
#    Bulge_to_Total        Bulge-to-total ratio
#    Bulge_to_Total_inf    Lower confidence limit in Bulge_to_Total
#    Bulge_to_Total_sup    Upper confidence limit in Bulge_to_Total
#    Arm_Strength          Strength of spiral arms
#    Arm_Strength_inf      Lower confidence limit in Arm_Strength
#    Arm_Strength_sup      Upper confidence limit in Arm_Strength
#    Arm_Curvature         Average curvature of the spiral arms 
#    Arm_Curvature_inf     Lower confidence limit in Arm_Curvature
#    Arm_Curvature_sup     Upper confidence limit in Arm_Curvature
#    Arm_Rotation          Direction of winding of the spiral arms 
#    Arm_Rotation_inf      Lower confidence limit in Arm_Rotation
#    Arm_Rotation_sup      Upper confidence limit in Arm_Rotation
#    Bar_Length            Length of central bar   
#    Bar_Length_inf        Lower confidence limit in Bar_Length
#    Bar_Length_sup        Upper confidence limit in Bar_Length
#    Inner_Ring            Strength of inner ring, lens or inner pseudo-ring
#    Inner_Ring_inf        Lower confidence limit in Inner_Ring  
#    Inner_Ring_sup        Upper confidence limit in Inner_Ring 
#    Outer_Ring            Strength of outer ring 
#    Outer_Ring_inf        Lower confidence limit in Outer_Ring
#    Outer_Ring_sup        Upper confidence limit in Outer_Ring
#    Pseudo_Ring           Type and strength of outer pseudo-ring 
#    Pseudo_Ring_inf       Lower confidence limit in Pseudo_Ring
#    Pseudo_Ring_sup       Upper confidence limit in Pseudo_Ring
#    Perturbation          Deviation from rotational symmetry
#    Perturbation_inf      Lower confidence limit in Perturbation
#    Perturbation_sup      Upper confidence limit in Perturbation
#    Visible_Dust          Strength of dust features 
#    Visible_Dust_inf      Lower confidence limit in Visible_Dust
#    Visible_Dust_sup      Upper confidence limit in Visible_Dust
#    Dust_Dispersion       Patchiness of dust features
#    Dust_Dispersion_inf   Lower confidence limit in Dust_Dispersion
#    Dust_Dispersion_sup   Upper confidence limit in Dust_Dispersion
#    Flocculence           Strength of scattered HII regions  
#    Flocculence_inf       Lower confidence limit in Flocculence
#    Flocculence_sup       Upper confidence limit in Flocculence
#    Hot_Spots             Strength of regions of strong star formation, active nuclei, or stellar nuclei 
#    Hot_Spots_inf         Lower confidence limit in Hot_Spots
#    Hot_Spots_sup         Upper confidence limit in Hot_Spots
#    Inclination           Inclination of disks or elongation of spheroids 
#    Inclination_inf       Lower confidence limit in Inclination
#    Inclination_sup       Upper confidence limit in Inclination
#    Contamination         Severity of contamination by stars, galaxies or artifacts
#    Contamination_inf     Lower confidence limit in Contamination
#    Contamination_sup     Upper confidence limit in Contamination
#    Multiplicity          Abundance of neighbouring galaxies 
#    Multiplicity_inf      Lower confidence limit in Multiplicity
#    Multiplicity_sup      Upper confidence limit in Multiplicity
#
# Notes  
#-------
#
#    Objects are ordered with increasing PGC number
#    Attribute values and confidence limits take values : 0, 0.25, 0.5, 0.75, or 1.0
#    Attribute and confidence limits of -1 are undetermined

In [4]:
# EFIGI_coord_redshift.txt version 1.6.2 :
#
# This file contains the PGC name, the J2000 EFIGI corrected coordinates, the EFIGI 
# selected redshifts and corresponding distances for the 4458 galaxies of 
# the EFIGI catalogue
#
# Description  
#--------------
#
#    PGC_name        PGC name
#    RA       	     Right ascension J2000 (degrees)
#    DEC   	         Declination J2000 (degrees)
#    z_hel           Selected heliocentric redshift 
#    z_hel_err       Uncertainty in selected heliocentric redshift 
#    z_hel_cat       Source catalogue of the selected heliocentric redshift (see Notes)
#    z_dis           Selected redshift corrected for Local Group infall into Virgo 
#    z_dis_err       Uncertainty in selected redshift corrected for Local Group infall into Virgo 
#    z_dis_cat       Catalogue of redshift correction for Local Group infall into Virgo redshift 
#    z_all           z_dis, or z_hel if z_dis undetermined 
#    z_all_type      Type of redshift used for z_all 
#    z_all_err       Uncertainty in redshift used for z_all 
#    z_all_err_type  Type of redshift used for uncertainty in z_all 
#    D_com           Comoving distance derived from z_all (Mpc) 
#    D_lum           Luminosity distance derived from z_all (Mpc) 
#    D_diam          Transverse diameter distance derived from z_all (Mpc) 
#
# Notes  
#-------
#
#  z_hel_cat =      HL if HyperLeda heliocentric redshift stored in z_hel
#            =      NED if NED      heliocentric redshift stored in z_hel
#            =      SDSS if SDSS    heliocentric redshift stored in z_hel
#  z_dis_cat =      HL if stored in z_dis the HyperLeda corrected redshift for Local Group infall into Virgo
#            =      NED-vir-HL if stored in z_dis the NED heliocentric redshift corrected with the HyperLeda correction 
#                   for Local Group infall into Virgo
#  z_all_type =     DIS if z_dis stored in z_all
#             =     HEL if z_hel stored in z_all
#  z_all_err_type = HEL if z_hel_err is stored in z_all_err
#                   DIS if z_dis_err is stored in z_all_err

In [5]:
# EFIGI_HyperLeda.txt version 1.6.2 :
#
# This file contains the PGC name, the HyperLeda velocities, redshifts, 
# types and names for the 4458 galaxies of the EFIGI catalogue
#
# Description  
#--------------
#
#      PGC_name	     PGC name		
#      PGC_no	     PGC number		
#      vrad	         Heliocentric radial velocity from radio measurement (km/s)
#      e_vrad	     Actual error on vrad (km/s)
#      vopt	         Heliocentric radial velocity from optical measurement (km/s)
#      e_vopt	     Actual error on vopt (km/s)
#      v	         Mean heliocentric radial velocity (km/s)
#      e_v	         Actual error on v (km/s)
#      vvir	         Radial velocity corrected for Local Group infall into Virgo (km/s)
#      zvir	         Redshift corrected for Local Group infall into Virgo 
#      z_err         Redshift error derived from e_v 
#      type          Morphological type 
#      objname	     Principal designation
#      hl_names      List of all object names
#
# Notes  
#-------
#
#    Objects are ordered with increasing PGC number
#    Undefined values of velocity are listed as -9999
#    Undefined values of redshift are listed as -99.99

In [6]:
# EFIGI_NED.txt version 1.6.2:
#
# This file contains the PGC name, the NED velocity, redshift and object name
# for the 4458 galaxies of the EFIGI catalogue
#
# Description  
#--------------
#
#    PGC_name      PGC name		
#    cz	           Heliocentric velocity (km/s) 
#    redshift 	   Redshift
#    nedname       Object name 		
#
# Notes  
#-------
#
#    Objects are ordered with increasing PGC number
#    Undefined values of velocity are listed as -9999
#    Undefined values of redshift are listed as -99.99

In [7]:
# EFIGI_PGC.txt version 1.6.2 :
#
# This file contains the PGC name and other PGC parameters for the 
# 4458 galaxies of the EFIGI catalogue
#
# Description  
#--------------
#
#   PGC_name      PGC name
#   T_PGC         RC3 morphological type
#   e_T_PGC       Mean error on T_PGC
#   type	      Expanded morphological type			
#   D25           log(D_{25}), decimal logarithm of mean apparent major
#                 isophotal diameter measured at or reduced to surface brightness
#                 level 25.0 B/mag^2 (in units of 0.1 arcmin)
#   R25           log(R_{25}) decimal logarithm of ratio of mean major isophotal 
#                 diameter, D_{25}, to mean minor isophotal diameter measured at 
#                 or reduced to the surface brightness level 25.0 B/mag^2
#   PA		      Position angle of major axis (degrees)
#   B_T_mag	      Total B magnitude
#   e_B_T_mag     Mean error on B_T_mag
#   B_V_T	      Total (B-V)
#   e_B_V_T	      Mean error on total (B-V)
#   cz		      Heliocentric velocity (km/s)
#   z             Redshift  
#
# Notes  
#-------
#
#    Undefined values of D25, R25, B_T_mag, e_B_T_mag, B_V_T, e_B_V_T, z, are listed as -99.99
#    Undefined values of PA and cz are listed as -9999

In [8]:
# EFIGI_SDSS.txt version 1.6.2 :
#
# This file contains the PGC name, the unique SDSS photometric identifier 
# and SDSS spectroscopic identifier, the SDSS redshift, uncertainty and 
# confidence level, for the 4458 galaxies of the EFIGI catalogue
#
# Description  
#--------------
#
#      PGC_name	     PGC name		
#      objID	     Unique SDSS photometric identifier composed from skyVersion, rerun, run, camcol, field, obj
#      specObjID     Unique SDSS spectroscopic identifier
#      z  	         Redshift 
#      zErr  	     Uncertainty in redshift 
#      zConf         Confidence level in redshift 
#
# Notes  
#-------
#
#    Objects are ordered with increasing PGC number
#    Undefined values of SDSS photometric and spectroscopic identifiers are listed as "none"
#    Undefined values of z, zErr and zConf are listed as -99.99

#### Reduce Image File Names with OS

This is a small stretch goal; the project can be completed and reproduced even if this step is skipped. A common prefix of "PGC00" exists across the image file names, which can be found by running the following command from the Python os package. The resulting filenames will still have considerable length and will begin with one or more "0"s.

In [50]:
# os.path.commonprefix("efigi-1.6-png/png")

#### Import Data Txt Files to Dataframe

Each data file contains varying delimiters, which makes parsing it as a CSV difficult.
For reproducibility, "\s+" should be used as the separator.

In [35]:
# Whitespace-delimited table; the raw string keeps the "\s" regex escape valid.
attributes = pd.read_csv('efigi-1.6/EFIGI_attributes.txt', sep=r"\s+")

In [37]:
# Whitespace-delimited table; the raw string keeps the "\s" regex escape valid.
coord_redshift = pd.read_csv('efigi-1.6/EFIGI_coord_redshift.txt', sep=r"\s+")

In [38]:
# Whitespace-delimited table; the raw string keeps the "\s" regex escape valid.
hyperleda = pd.read_csv('efigi-1.6/EFIGI_HyperLeda.txt', sep=r"\s+")

In [39]:
# Whitespace-delimited table; the raw string keeps the "\s" regex escape valid.
ned = pd.read_csv('efigi-1.6/EFIGI_NED.txt', sep=r"\s+")

In [40]:
# Whitespace-delimited table; the raw string keeps the "\s" regex escape valid.
pgc = pd.read_csv('efigi-1.6/EFIGI_PGC.txt', sep=r"\s+")

In [41]:
# Whitespace-delimited table; the raw string keeps the "\s" regex escape valid.
sdss = pd.read_csv('efigi-1.6/EFIGI_SDSS.txt', sep=r"\s+")

In [42]:
attributes

Unnamed: 0,PGCname,T,T_inf,T_sup,Bulge_to_Total,Bulge_to_Total_inf,Bulge_to_Total_sup,Arm_Strength,Arm_Strength_inf,Arm_Strength_sup,...,Hot_Spots_sup,Inclination,Inclination_inf,Inclination_sup,Contamination,Contamination_inf,Contamination_sup,Multiplicity,Multiplicity_inf,Multiplicity_sup
0,PGC0000212,1,0,2,0.50,0.50,0.50,0.25,0.25,0.25,...,0.25,0.75,0.75,0.75,0.00,0.00,0.25,0.25,0.25,0.25
1,PGC0000218,0,-1,1,0.75,0.50,0.75,-1.00,-1.00,-1.00,...,0.00,1.00,1.00,1.00,0.25,0.25,0.25,0.00,0.00,0.00
2,PGC0000243,-2,-3,-2,1.00,0.75,1.00,0.00,0.00,0.25,...,0.25,0.00,0.00,0.25,0.00,0.00,0.00,0.00,0.00,0.00
3,PGC0000255,9,9,10,0.00,0.00,0.25,0.00,0.00,0.25,...,0.25,0.25,0.00,0.25,0.25,0.00,0.25,0.00,0.00,0.00
4,PGC0000281,7,6,9,0.25,0.00,0.50,0.00,0.00,0.25,...,0.75,0.25,0.00,0.25,0.50,0.50,0.50,0.25,0.00,0.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4453,PGC0072806,6,4,8,0.25,0.25,0.25,0.00,0.00,0.50,...,0.25,0.25,0.25,0.50,0.00,0.00,0.00,0.00,0.00,0.00
4454,PGC0072922,4,3,4,0.25,0.25,0.50,0.50,0.50,0.50,...,0.25,0.00,0.00,0.25,0.50,0.50,0.50,0.00,0.00,0.00
4455,PGC0072927,3,2,4,0.50,0.50,0.75,0.25,0.25,0.50,...,0.25,0.00,0.00,0.25,0.00,0.00,0.25,0.00,0.00,0.00
4456,PGC0072930,3,2,4,0.75,0.50,0.75,0.00,0.00,0.25,...,0.25,0.50,0.25,0.50,0.25,0.00,0.25,0.00,0.00,0.00


In [43]:
coord_redshift

Unnamed: 0,PGC_name,RA,DEC,z_hel,z_hel_err,z_hel_cat,z_dis,z_dis_err,z_dis_cat,z_all,z_all_type,z_all_err,z_all_err_type,D_com,D_lum,D_diam
0,PGC0000212,0.797083,15.965056,0.037460,0.000030,HL,0.037650,-99.99,HL,0.037650,DIS,0.000030,HEL,160.0045,166.0286,154.1989
1,PGC0000218,0.812125,16.145417,0.003501,0.000015,HL,0.003699,-99.99,HL,0.003699,DIS,0.000015,HEL,15.8343,15.8929,15.7759
2,PGC0000243,0.883917,-10.744583,0.029310,-99.990000,NED,0.029070,-99.99,NED-vir-HL,0.029070,DIS,-99.990000,none,123.6766,127.2719,120.1829
3,PGC0000255,0.929958,15.218306,0.002929,0.000014,HL,0.003112,-99.99,HL,0.003112,DIS,0.000014,HEL,13.2682,13.3095,13.2270
4,PGC0000281,1.002500,-11.177778,0.038330,0.000055,HL,0.038080,-99.99,HL,0.038080,DIS,0.000055,HEL,161.9017,168.0669,155.9627
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4453,PGC0072806,358.544583,0.393889,0.049870,-99.990000,NED,0.049820,-99.99,NED-vir-HL,0.049820,DIS,-99.990000,none,211.1033,221.6205,201.0853
4454,PGC0072922,359.016333,-0.988472,0.022250,0.000042,HL,0.022180,-99.99,HL,0.022180,DIS,0.000042,HEL,94.6472,96.7465,92.5935
4455,PGC0072927,359.029167,-0.916944,0.024280,0.000017,HL,0.024210,-99.99,HL,0.024210,DIS,0.000017,HEL,103.1315,105.6283,100.6937
4456,PGC0072930,359.032500,0.549167,0.022240,0.000014,HL,0.022190,-99.99,HL,0.022190,DIS,0.000014,HEL,94.6472,96.7474,92.5926


In [44]:
hyperleda

Unnamed: 0,PGC_name,PGC_no,vrad,e_vrad,vopt,e_vopt,v,e_v,vvir,zvir,z_err,type,objname,hl_names
0,PGC0000212,212.0,11230.4,4.5,11110.0,47.0,11229.3,9.0,11287.9,0.037650,0.000030,Sab,IC5381,PGC000212
1,PGC0000218,218.0,1050.3,4.8,1027.4,25.0,1049.5,4.5,1109.0,0.003699,0.000015,Sab,NGC7814,PGC1501809
2,PGC0000243,243.0,-9999.0,-9999.0,8914.3,16.3,8914.3,16.3,8841.8,0.029490,0.000054,S0,NGC7808,6dFJ0003321-104441
3,PGC0000255,255.0,878.1,4.1,-9999.0,-9999.0,878.1,4.1,932.9,0.003112,0.000014,Sm,UGC00017,PGC000255
4,PGC0000281,281.0,-9999.0,-9999.0,11490.7,16.4,11490.7,16.4,11416.0,0.038080,0.000055,Sc,PGC000281,MCG-02-01-015
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4453,PGC0072806,72810.0,-9999.0,-9999.0,18356.6,15.9,18356.6,15.9,18341.9,0.061180,0.000053,S0-a,PGC072806,HCG098D
4454,PGC0072922,72920.0,6664.7,7.3,6717.8,20.8,6670.6,12.6,6648.3,0.022180,0.000042,Sb,IC1515,PGC197276
4455,PGC0072927,72930.0,7279.8,7.7,7279.0,18.5,7279.7,5.1,7257.8,0.024210,0.000017,Sbc,IC1516,HIPASSJ2356-00
4456,PGC0072930,72930.0,6668.0,5.2,6651.8,29.1,6667.5,4.1,6652.9,0.022190,0.000014,S0-a,NGC7787,MCG+00-01-005


In [45]:
ned

Unnamed: 0,PGC_name,cz,redshift,nedname
0,PGC000212,11231,0.037460,IC_5381
1,PGC000218,1050,0.003502,NGC_7814
2,PGC000243,8787,0.029310,NGC_7808
3,PGC000255,878,0.002929,UGC_00017
4,PGC000281,11491,0.038330,MCG_-02-01-015
...,...,...,...,...
4453,PGC072806,14950,0.049870,UGC_12837/NOTES01
4454,PGC072922,6665,0.022230,IC_1515
4455,PGC072927,7286,0.024300,IC_1516
4456,PGC072930,6664,0.022230,NGC_7787


In [46]:
pgc

Unnamed: 0,PGC_name,T_PGC,T_PGC_err,type,D25,R25,PA,B_T_mag,e_B_T_mag,B_V_T,e_B_V_T,cz,z
0,PGC0000212,2.0,1.8,.S..2$/,1.16,0.58,54,-99.99,-99.99,-99.99,-99.99,-9999,-99.990000
1,PGC0000218,2.0,0.3,.SAS2*/,1.74,0.38,135,11.56,0.13,0.99,0.02,1042,0.003476
2,PGC0000243,-2.0,1.2,PLA.0*.,1.10,-99.99,-9999,13.48,0.13,0.85,0.03,8923,0.029760
3,PGC0000255,9.3,0.6,.S..9*.,1.39,0.15,-9999,14.80,0.20,0.61,0.09,-9999,-99.990000
4,PGC0000281,1.0,1.7,.S..1?.,0.34,0.12,-9999,-99.99,-99.99,-99.99,-99.99,-9999,-99.990000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
4453,PGC0072806,-1.0,0.9,.L..+$.,0.42,-99.99,-9999,-99.99,-99.99,-99.99,-99.99,-9999,-99.990000
4454,PGC0072922,2.0,0.5,PSBT2..,1.06,0.09,160,-99.99,-99.99,-99.99,-99.99,6726,0.022440
4455,PGC0072927,4.3,0.5,.S..4P.,1.24,0.04,65,-99.99,-99.99,-99.99,-99.99,7300,0.024350
4456,PGC0072930,0.0,0.6,PSBT0*.,1.25,0.58,104,-99.99,-99.99,-99.99,-99.99,-9999,-99.990000


In [47]:
sdss

Unnamed: 0,PGC_name,objId,SpecObjId,z,zErr,zConf
0,PGC0000212,587730775499735086,211330582687252480,0.037450,0.000187,0.9988
1,PGC0000218,587727223561388198,none,-99.990000,-99.990000,-99.9900
2,PGC0000243,587727178449879045,183182687387779072,0.029610,0.000177,0.9987
3,PGC0000255,587727222487646439,211330582808887296,0.002882,0.000139,0.7125
4,PGC0000281,587727177913008262,183182687425527808,0.038340,0.000079,0.9948
...,...,...,...,...,...,...
4453,PGC0072806,587731186740166867,192754470772277248,-100.000000,-100.000000,-99.9900
4454,PGC0072922,587731185129750564,none,-99.990000,-99.990000,-99.9900
4455,PGC0072927,587731185129750586,109153257862987776,0.024300,0.000162,0.9996
4456,PGC0072930,588015509805793362,108871771410661376,0.022220,0.000086,0.9991


#### Starting with our simple Target

While stretch-goal iterations of this project continue with the examination and analysis of the larger datasets created above, we will begin by isolating our image classification target. In the 'attributes' table, the 'T' column indicates the EFIGI morphological type.

In [49]:
attributes['T'].value_counts()

 3     517
 4     472
 6     448
 8     355
 5     303
 7     285
 9     263
 1     257
 10    248
-5     227
 2     219
-2     196
 0     196
-3     189
-1     152
 11     69
-4      44
-6      18
Name: T, dtype: int64

Above we see the wide variances of types.

We extract the 'T' column and the 'PGCname' column to relate the values to our image file names.

In [53]:
# Keep only the image identifier and its class label.
target_df = attributes.filter(items=['PGCname', 'T'], axis='columns')

In [54]:
target_df

Unnamed: 0,PGCname,T
0,PGC0000212,1
1,PGC0000218,0
2,PGC0000243,-2
3,PGC0000255,9
4,PGC0000281,7
...,...,...
4453,PGC0072806,6
4454,PGC0072922,4
4455,PGC0072927,3
4456,PGC0072930,3


#### Separating Images by Class

Our later stretch iterations will interact with the FITS image files in order to allow modeling on varying bands of each image. 

Using the fits command below from the astropy.io package on the first image in the i band, we see that the FITS files contain only a single primary HDU (in this case an image). If there were ASCII or binary tables in each FITS file as well, we would want to utilize the FITS files even in our base classification. Since there are none, we can use the PNG images for now.

We will use the Python os package to separate our images by class.

PLEASE NOTE THAT THE PROCESS BELOW WAS PREVIOUSLY DONE IN A HACKJOB WAY FOR FSM; THESE FUNCTIONS ARE A NEW PIECE I BEGAN TODAY FOR REPRODUCIBILITY.

In [58]:
# Open inside a context manager so the FITS file handle is closed as soon as
# the HDU summary has been printed.
with fits.open('efigi-images/psf_i/psf_PGC0000212_i.fits') as hdu_info:
    hdu_info.info()

Filename: efigi-images/psf_i/psf_PGC0000212_i.fits
No.    Name      Ver    Type      Cards   Dimensions   Format
  0  chip01        1 PrimaryHDU      19   (45, 45)   float32   


In [75]:
def T_to_string(df=None):
    '''
    Casts the 'T' column of a dataframe to strings, in place.

    String labels are required wherever the class value is concatenated
    into a path (e.g. 'img/' + T + '/' + PGCname + '.png').

    df = dataframe with a 'T' column; when omitted, the notebook-level
         `attributes` dataframe is used so that existing `T_to_string()`
         calls keep working

    Returns the same dataframe object, with 'T' converted to str.
    '''
    if df is None:
        # Backward-compatible fallback to the notebook global.
        df = attributes
    df['T'] = df['T'].astype(str)
    return df

In [76]:
T_to_string()

Unnamed: 0,PGCname,T,T_inf,T_sup,Bulge_to_Total,Bulge_to_Total_inf,Bulge_to_Total_sup,Arm_Strength,Arm_Strength_inf,Arm_Strength_sup,...,Hot_Spots_sup,Inclination,Inclination_inf,Inclination_sup,Contamination,Contamination_inf,Contamination_sup,Multiplicity,Multiplicity_inf,Multiplicity_sup
0,PGC0000212,1,0,2,0.50,0.50,0.50,0.25,0.25,0.25,...,0.25,0.75,0.75,0.75,0.00,0.00,0.25,0.25,0.25,0.25
1,PGC0000218,0,-1,1,0.75,0.50,0.75,-1.00,-1.00,-1.00,...,0.00,1.00,1.00,1.00,0.25,0.25,0.25,0.00,0.00,0.00
2,PGC0000243,-2,-3,-2,1.00,0.75,1.00,0.00,0.00,0.25,...,0.25,0.00,0.00,0.25,0.00,0.00,0.00,0.00,0.00,0.00
3,PGC0000255,9,9,10,0.00,0.00,0.25,0.00,0.00,0.25,...,0.25,0.25,0.00,0.25,0.25,0.00,0.25,0.00,0.00,0.00
4,PGC0000281,7,6,9,0.25,0.00,0.50,0.00,0.00,0.25,...,0.75,0.25,0.00,0.25,0.50,0.50,0.50,0.25,0.00,0.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4453,PGC0072806,6,4,8,0.25,0.25,0.25,0.00,0.00,0.50,...,0.25,0.25,0.25,0.50,0.00,0.00,0.00,0.00,0.00,0.00
4454,PGC0072922,4,3,4,0.25,0.25,0.50,0.50,0.50,0.50,...,0.25,0.00,0.00,0.25,0.50,0.50,0.50,0.00,0.00,0.00
4455,PGC0072927,3,2,4,0.50,0.50,0.75,0.25,0.25,0.50,...,0.25,0.00,0.00,0.25,0.00,0.00,0.25,0.00,0.00,0.00
4456,PGC0072930,3,2,4,0.75,0.50,0.75,0.00,0.00,0.25,...,0.25,0.50,0.25,0.50,0.25,0.00,0.25,0.00,0.00,0.00


In [111]:
def create_img_dirs(df, img_dir='img'):
    '''
    Creates one folder per class inside the main image directory.

    df = dataframe with a 'T' column holding the class label of each image
    img_dir = main image directory (created if missing); defaults to 'img'
    '''

    # Select the label column by name. Positional access (df.iloc[:, 2])
    # picks the third column, which is 'T_inf' in the attributes table
    # displayed in this notebook, while images are filed under df['T'].
    class_names = df['T'].unique().tolist()

    # exist_ok keeps the cell re-runnable after a kernel restart
    os.makedirs(img_dir, exist_ok=True)

    # create folder for each class name
    for x in class_names:
        os.makedirs(os.path.join(img_dir, str(x)), exist_ok=True)


In [131]:
def assign_names_by_target(df, src_dir='efigi-1.6-png/png', img_dir='img'):
    '''
    Copies each PNG image into the folder of its class.

    For every row, '<src_dir>/<PGCname>.png' is copied to
    '<img_dir>/<T>/<PGCname>.png'.

    df = dataframe with 'PGCname' and 'T' columns
    src_dir = folder holding the source PNG files
    img_dir = main image directory containing one folder per class
    '''
    for PGCname, T in zip(df['PGCname'], df['T']):
        filename = PGCname + '.png'

        # str() lets integer labels work without a prior string cast
        class_dir = os.path.join(img_dir, str(T))

        # copyfile does not create missing parent folders
        os.makedirs(class_dir, exist_ok=True)

        copyfile(os.path.join(src_dir, filename),
                 os.path.join(class_dir, filename))

In [132]:
assign_names_by_target(attributes)

In [None]:
def split_class_by_folder(img_dir, train_size, test_size, val_size, out_dir='images'):
    '''
    Separates all images into train, test, and val folders with
    stratified classes.

    NOTE: only the folder skeleton is created so far. The sampling and
    copying steps are still TODO (see the comments at the end of the
    loop), so train_size, test_size and val_size are not used yet.

    img_dir = main image directory
    train_size = percent of images to be added to train folder
    test_size = percent of images to be added to test folder
    val_size = percent of images to be added to validation folder
    out_dir = folder that receives the train/test/val folders;
              defaults to 'images'
    '''

    # get subdirs from main image folder
    sub_dirs = [sub_dir for sub_dir in os.listdir(img_dir)
                if os.path.isdir(os.path.join(img_dir, sub_dir))]

    # create the folders (makedirs also creates out_dir itself;
    # exist_ok makes the call safe to repeat)
    split_dirs = [os.path.join(out_dir, split)
                  for split in ('train', 'test', 'val')]
    for split_dir in split_dirs:
        os.makedirs(split_dir, exist_ok=True)

    # create subdirectories for train, test, val folders
    for sub_dir in sub_dirs:
        for split_dir in split_dirs:
            os.makedirs(os.path.join(split_dir, sub_dir), exist_ok=True)

        # TODO: put all 'PGCname's from each sub_dir into lists
        
        # TODO: randomly sample names from list until train_size is 
        # met, do not replace
        
        # TODO: names in remaining list will be placed in test files
        
        # TODO: remove val_size of images from training, to val sub_dir
        
        # TODO: produce new file paths in all folders
        
        # TODO: shutil to copy the image into the new folder
