# North Carolina Teacher Attrition By Reason Data 2017-2018
* This program downloads all original teacher attrition rate by reason datasets from http://www.ncpublicschools.org/docs/district-humanresources/surveys/leaving/ and saves them as .csv files. These data files are used to create all the flattened and machine learning datasets
    * The data is located in the appendix of the yearly NC state teaching profession report (PDF file), page 27 to page 31.
    * Each page has one table; the program loops through each page and extracts the data into a list.
    * Some LEA names are too long and spill over multiple rows; these are restored to their original full names as well.
    * The final result is saved as a CSV file in the RawDataset folder.

In [1]:
import PyPDF2
import requests 
from tabula import read_pdf
import tabula
import pandas as pd
import urllib
import warnings
# Keep the notebook output quiet: install a blanket "ignore" filter for all
# warnings, then an explicit "ignore" filter for DeprecationWarning as well.
warnings.simplefilter(action='ignore')
warnings.simplefilter(action='ignore', category=DeprecationWarning)
# Disable the pandas chained-assignment check for this session.
pd.options.mode.chained_assignment = None

In [3]:
PDFfilename = "2017-18-state-teaching-profession.pdf"

url = 'http://www.ncpublicschools.org/docs/district-humanresources/surveys/leaving/' + PDFfilename
# Download the report. The timeout keeps a stalled server from hanging the
# notebook, and raise_for_status() stops here on an HTTP error instead of
# saving the server's error page under a .pdf name.
r = requests.get(url, timeout=60)
r.raise_for_status()
with open(PDFfilename, 'wb') as f:
    f.write(r.content)

NewPDFfilename = "TeacherAttrition_ByReasonsCategories"

# Split the appendix pages into one single-page PDF each; `files` collects the
# names of the generated files in page order.
# NOTE(review): range(27, 31) yields pages 27-30, while the notebook header
# says "page 27 to page 31" - confirm whether page 31 must be included.
files = []
# The reader pulls page data from the stream while the pages are written, so
# the source file stays open for the whole loop and is closed afterwards.
with open(PDFfilename, "rb") as pdf_stream:
    pfr = PyPDF2.PdfFileReader(pdf_stream)
    for i in range(27, 31):
        pg = pfr.getPage(i - 1)  # getPage() is 0-based, `i` is the 1-based page number
        writer = PyPDF2.PdfFileWriter()
        writer.addPage(pg)
        PDF = NewPDFfilename + "_" + str(i) + ".pdf"
        files.append(PDF)
        with open(PDF, "wb") as outputStream:
            writer.write(outputStream)


In [6]:
# Read the table from every single-page PDF and stack them into one data frame.
frames = []
for file in files:
    extracted = tabula.read_pdf(file, multiple_tables=False)
    # Depending on the installed tabula-py version, read_pdf() returns either
    # a single DataFrame or a list of DataFrames; accept both shapes.
    if isinstance(extracted, pd.DataFrame):
        frames.append(extracted)
    else:
        frames.extend(extracted)

# Concatenate once instead of on every iteration. The list is reversed so the
# rows keep the order the previous implementation produced (later pages first).
if frames:
    df_combine = pd.concat(frames[::-1], axis=0, sort=False)
else:
    # No input files: keep the empty frame the old code ended up with.
    df_combine = pd.DataFrame([])

            
# Replace the headers inferred from the PDF with the final column names.
# The assignment is positional, so the combined frame must have exactly
# these 12 columns.
df_combine.columns = ['LEACode', 'LEAName', 'TotalTeachers', 'TeachersLeaving', 'PersonalReasons'
                            ,'PersonalPerct','InitiatedbyLEA','InitiatedPerct','BeyondControlofLEA','BeyondControlPerct'
                           ,'OtherReasons','OtherReasonsPerct']

# Remove header rows (LEACode equal to 'LEA' or 'Code') and rows without any
# LEA code (presumably the continuation lines of names that wrapped - verify
# against the extracted tables).
exclude = ['LEA', 'Code']
keep_rows = df_combine['LEACode'].notna() & ~df_combine['LEACode'].isin(exclude)
# .copy() makes `df` an independent frame, so the .loc assignments below do
# not rely on the chained-assignment warning being silenced.
df = df_combine[keep_rows].copy()

# Fix the names that were split across multiple lines in the PDF.
full_lea_names = {
    '700': 'Elizabeth City-Pasquotank Public Schools',
    '94Z': 'Northeast Regional School - Biotech/Agri',
    '340': 'Winston Salem/Forsyth County Schools',
}
for lea_code, lea_name in full_lea_names.items():
    df.loc[df['LEACode'] == lea_code, 'LEAName'] = lea_name

# Save to a CSV file, ordered by LEA code and without the index column.
csv = "../SchoolTeacherAttrition_ByReasons2017.csv"
df_sorted = df.sort_values(by='LEACode', ascending=True)

df_sorted.to_csv(csv, index=False)
