# Transcript processing

Handle transcripts in various formats

## Setup

Pdf tools for use on mac 
```
brew install poppler
```

requirements.txt
```
pillow
colorthief
pdftotext
pdf2img
pandas 
numpy
scikit-learn
joblib
```

## Preprocessing

Stage all sample files in `samples` subdir prior to running this. 

For now this is limited to pdf files, as all examples are pdf files.

Preprocessing will convert pdf pages to png files, and if text is also associated with the pdf it will store text.

A more advanced version of the text extraction would to store coordinates of words as well `python-poppler` for clustering (HAC).

In [3]:
import os
from pdf2image import convert_from_path
import pdftotext
import pandas as pd
import numpy as np

In [1]:
transcripts = [] 
def list_files_with_extension(directory, extension):
    files_with_extension = []
    for filename in os.listdir(directory):
        if filename.endswith(extension):
            files_with_extension.append(filename)
    return files_with_extension

folder = 'samples/'
files = list_files_with_extension(folder, 'pdf')
for file in files:
    filename = os.path.join(folder, file)
    transcript = {'source': file}
    pages = []
    try:
        images = convert_from_path(filename, output_folder=folder, fmt='png')
        for i, image in enumerate(images):
            pagename = f'{file}_{i+1}.png'
            image.save(os.path.join(folder, pagename), "PNG")
            pages.append(pagename)
    except:
        print(f'bad file image data: {file}')
    transcript['pages'] = pages

    text = ''
    try:
        with open(filename, "rb") as f:
            pdf = pdftotext.PDF(f)
            text = "\n".join(pdf)
    except:
        print(f'bad file text data: {file}')
    transcript['text'] = text
    transcripts.append(transcript)    
print(transcripts[0])

bad file image data: 1fec30ff-31af-47b0-95d9-7acd66551e29.pdf
{'source': 'R11963430-25840119-file0001.pdf', 'pages': ['R11963430-25840119-file0001.pdf_1.png', 'R11963430-25840119-file0001.pdf_2.png', 'R11963430-25840119-file0001.pdf_3.png'], 'text': '\x0c\n\x0c\n\x0c'}


In [2]:
from colorthief import ColorThief
from PIL import Image
import imagehash

folder = 'samples/'
for transcript in transcripts:
    palettes = []
    sizes = []
    hashes = []
    for page in transcript.get('pages'):
        page_filename = os.path.join(folder, page)
        color_thief = ColorThief(page_filename)
        palettes.append(color_thief.get_palette(color_count=5))
        image = Image.open(page_filename)
        sizes.append(image.size)
        hash = imagehash.average_hash(image)
        hashes.append(str(hash))
    transcript['palettes'] = palettes
    transcript['sizes'] = sizes
    transcript['hashes'] = hashes
print(transcripts[0])

{'source': 'R11963430-25840119-file0001.pdf', 'pages': ['R11963430-25840119-file0001.pdf_1.png', 'R11963430-25840119-file0001.pdf_2.png', 'R11963430-25840119-file0001.pdf_3.png'], 'text': '\x0c\n\x0c\n\x0c', 'palettes': [[(11, 13, 18), (219, 227, 230), (110, 82, 69), (122, 156, 179), (184, 136, 86)], [(14, 13, 14), (205, 215, 218), (103, 157, 198), (57, 101, 151), (145, 87, 51)], [(21, 19, 20), (196, 212, 216), (127, 161, 184), (62, 102, 151), (167, 88, 35)]], 'sizes': [(2200, 1700), (2200, 1700), (2200, 1700)], 'hashes': ['fe2e00feb7ffffff', 'fe2e00fefffffffe', 'fe2e00debfbfffff']}


In [80]:
labels_df = pd.read_csv('labels.csv')
labels_df['name'].value_counts()

name
East Side Union High School           5
East Union High School                4
Weber Technology High School          4
St. Mary's High School                3
Aspire Langston Hughes Academy        3
NYC Department Of Education           3
Lodi High School                      3
Coppel High School                    3
Plano West Senior High                2
ALLEN HIGH SCHOOL                     1
PROSPER HIGH SCHOOL                   1
Vandegrift High School                1
Westfield High School                 1
The Bronx High School of Science      1
The Urban Assembly NY Harbo School    1
Central Islip High School             1
Francis Lewis High School             1
Northport High School                 1
Name: count, dtype: int64

In [82]:
labels_df['ceeb_code'].value_counts()

ceeb_code
51976.0     4
53435.0     3
441485.0    3
311585.0    1
333480.0    1
331926.0    1
331290.0    1
330636.0    1
440077.0    1
440363.0    1
334205.0    1
Name: count, dtype: int64

In [139]:
import pandas as pd
df = pd.DataFrame(columns=['source', 'page_file', 'page', 'count', 'text', 'palette', 'hash', 'size', 'hs'])
i = 1
for row in transcripts:
    label_row = labels_df.loc[labels_df['source'] == row['source'], ['name']]
    print(label_row)
    for page, page_file in enumerate(row['pages']):
        df.loc[i] = [row['source'], page_file, page+1, len(row['pages']), 
                     row['text'], row['palettes'][page], row['hashes'][page], 
                     row['sizes'][page], label_row['name'].iloc[0,]]
        i += 1
df

                  name
31  Coppel High School
                           name
3  Weber Technology High School
                               name
1  The Bronx High School of Science
                name
25  Lodi High School
                  name
26  Coppel High School
                   name
30  PROSPER HIGH SCHOOL
                      name
14  St. Mary's High School
                      name
33  Plano West Senior High
                            name
22  Weber Technology High School
                             name
7  Aspire Langston Hughes Academy
                            name
18  Weber Technology High School
                name
21  Lodi High School
                name
20  Lodi High School
                           name
17  East Side Union High School
                      name
34  East Union High School
                      name
36  East Union High School
                           name
10  East Side Union High School
                      name
29  Vandegrift High School


Unnamed: 0,source,page_file,page,count,text,palette,hash,size,hs
1,R11963430-25840119-file0001.pdf,R11963430-25840119-file0001.pdf_1.png,1,3,\n\n,"[(11, 13, 18), (219, 227, 230), (110, 82, 69),...",fe2e00feb7ffffff,"(2200, 1700)","31 Coppel High School Name: name, dtype: ob..."
2,R11963430-25840119-file0001.pdf,R11963430-25840119-file0001.pdf_2.png,2,3,\n\n,"[(14, 13, 14), (205, 215, 218), (103, 157, 198...",fe2e00fefffffffe,"(2200, 1700)","31 Coppel High School Name: name, dtype: ob..."
3,R11963430-25840119-file0001.pdf,R11963430-25840119-file0001.pdf_3.png,3,3,\n\n,"[(21, 19, 20), (196, 212, 216), (127, 161, 184...",fe2e00debfbfffff,"(2200, 1700)","31 Coppel High School Name: name, dtype: ob..."
4,0e0af021-9b60-4672-bb35-c8b7f4269bc2.pdf,0e0af021-9b60-4672-bb35-c8b7f4269bc2.pdf_1.png,1,1,,"[(14, 17, 25), (211, 211, 206), (92, 72, 66), ...",3f1b8387e7fcfcfc,"(1700, 2200)","3 Weber Technology High School Name: name, ..."
5,0a8bbe6b-901b-4451-8cda-503f1b2a3c84.pdf,0a8bbe6b-901b-4451-8cda-503f1b2a3c84.pdf_1.png,1,2,\nAcademic Year: 2020-2021\nSubject\nFine Per...,"[(41, 41, 53), (233, 227, 209), (157, 195, 214...",0707ef8d898d8dbf,"(1700, 2200)",1 The Bronx High School of Science Name: na...
...,...,...,...,...,...,...,...,...,...
68,0fafd1ac-1f8e-4575-818b-6d72c9d347a6.pdf,0fafd1ac-1f8e-4575-818b-6d72c9d347a6.pdf_1.png,1,1,,"[(26, 25, 27), (211, 209, 200), (117, 169, 206...",8181fff1f3f1ffbf,"(1700, 2200)","4 St. Mary's High School Name: name, dtype:..."
69,R11960756-25801269-file0001.pdf,R11960756-25801269-file0001.pdf_1.png,1,4,\n\nSTATE OF TEXAS\nACADEMIC ACHIEVEMENT REC...,"[(193, 208, 225), (61, 65, 100), (159, 79, 76)...",0000bfa524ffffff,"(1700, 2200)","27 ALLEN HIGH SCHOOL Name: name, dtype: object"
70,R11960756-25801269-file0001.pdf,R11960756-25801269-file0001.pdf_2.png,2,4,\n\nSTATE OF TEXAS\nACADEMIC ACHIEVEMENT REC...,"[(206, 211, 216), (197, 83, 67), (63, 62, 88),...",ffe0e0f0f0f78000,"(1700, 2200)","27 ALLEN HIGH SCHOOL Name: name, dtype: object"
71,R11960756-25801269-file0001.pdf,R11960756-25801269-file0001.pdf_3.png,3,4,\n\nSTATE OF TEXAS\nACADEMIC ACHIEVEMENT REC...,"[(205, 204, 204), (27, 23, 27), (107, 104, 105...",2727f7070f0fffff,"(1700, 2200)","27 ALLEN HIGH SCHOOL Name: name, dtype: object"


In [120]:
THRESHOLD = 10
def is_grayscale(palette):
    for pal in palette:
        if (max(pal) - min(pal)) > THRESHOLD:
            return 0
    return 1
df['is_gray'] = df['palette'].apply(is_grayscale)
df['is_gray'].value_counts()

is_gray
0    48
1    24
Name: count, dtype: int64

In [121]:
def is_landscape(size):
    return 1 if size[1] < size[0] else 0
df['is_landscape'] = df['size'].apply(is_landscape)
df['is_landscape'].value_counts()

is_landscape
0    59
1    13
Name: count, dtype: int64

In [89]:
def has_text(text):
    return 1 if len(text) > 20 else 0
df['has_text'] = df['text'].apply(has_text)
df['has_text'].value_counts()

has_text
0    49
1    23
Name: count, dtype: int64

In [108]:
df.to_csv('transcripts_prep.csv', index=False)

In [4]:
# Can load df if starting from here
import ast
df = pd.read_csv('transcripts_prep.csv', converters={'palette': ast.literal_eval, 'size': ast.literal_eval})
df

Unnamed: 0,source,page_file,page,count,text,palette,hash,size,hs,is_gray,is_landscape,has_text
0,R11963430-25840119-file0001.pdf,R11963430-25840119-file0001.pdf_1.png,1,3,\n\n,"[(11, 13, 18), (219, 227, 230), (110, 82, 69),...",fe2e00feb7ffffff,"(2200, 1700)","31 Coppel High School\nName: name, dtype: o...",0,1,0
1,R11963430-25840119-file0001.pdf,R11963430-25840119-file0001.pdf_2.png,2,3,\n\n,"[(14, 13, 14), (205, 215, 218), (103, 157, 198...",fe2e00fefffffffe,"(2200, 1700)","31 Coppel High School\nName: name, dtype: o...",0,1,0
2,R11963430-25840119-file0001.pdf,R11963430-25840119-file0001.pdf_3.png,3,3,\n\n,"[(21, 19, 20), (196, 212, 216), (127, 161, 184...",fe2e00debfbfffff,"(2200, 1700)","31 Coppel High School\nName: name, dtype: o...",0,1,0
3,0e0af021-9b60-4672-bb35-c8b7f4269bc2.pdf,0e0af021-9b60-4672-bb35-c8b7f4269bc2.pdf_1.png,1,1,,"[(14, 17, 25), (211, 211, 206), (92, 72, 66), ...",3f1b8387e7fcfcfc,"(1700, 2200)","3 Weber Technology High School\nName: name,...",0,0,0
4,0a8bbe6b-901b-4451-8cda-503f1b2a3c84.pdf,0a8bbe6b-901b-4451-8cda-503f1b2a3c84.pdf_1.png,1,2,\nAcademic Year: 2020-2021\nSubject\nFine Per...,"[(41, 41, 53), (233, 227, 209), (157, 195, 214...",0707ef8d898d8dbf,"(1700, 2200)",1 The Bronx High School of Science\nName: n...,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
67,0fafd1ac-1f8e-4575-818b-6d72c9d347a6.pdf,0fafd1ac-1f8e-4575-818b-6d72c9d347a6.pdf_1.png,1,1,,"[(26, 25, 27), (211, 209, 200), (117, 169, 206...",8181fff1f3f1ffbf,"(1700, 2200)","4 St. Mary's High School\nName: name, dtype...",0,0,0
68,R11960756-25801269-file0001.pdf,R11960756-25801269-file0001.pdf_1.png,1,4,\n\nSTATE OF TEXAS\nACADEMIC ACHIEVEMENT REC...,"[(193, 208, 225), (61, 65, 100), (159, 79, 76)...",0000bfa524ffffff,"(1700, 2200)","27 ALLEN HIGH SCHOOL\nName: name, dtype: ob...",0,0,1
69,R11960756-25801269-file0001.pdf,R11960756-25801269-file0001.pdf_2.png,2,4,\n\nSTATE OF TEXAS\nACADEMIC ACHIEVEMENT REC...,"[(206, 211, 216), (197, 83, 67), (63, 62, 88),...",ffe0e0f0f0f78000,"(1700, 2200)","27 ALLEN HIGH SCHOOL\nName: name, dtype: ob...",0,0,1
70,R11960756-25801269-file0001.pdf,R11960756-25801269-file0001.pdf_3.png,3,4,\n\nSTATE OF TEXAS\nACADEMIC ACHIEVEMENT REC...,"[(205, 204, 204), (27, 23, 27), (107, 104, 105...",2727f7070f0fffff,"(1700, 2200)","27 ALLEN HIGH SCHOOL\nName: name, dtype: ob...",1,0,1


In [21]:
def flat(palette):
    return [item for sublist in palette for item in sublist]
df['flat'] = df['palette'].apply(flat)
df                                 

Unnamed: 0,source,page_file,page,count,text,palette,hash,size,hs,is_gray,is_landscape,has_text,label,flat
0,R11963430-25840119-file0001.pdf,R11963430-25840119-file0001.pdf_1.png,1,3,\n\n,"[(11, 13, 18), (219, 227, 230), (110, 82, 69),...",fe2e00feb7ffffff,"(2200, 1700)","31 Coppel High School\nName: name, dtype: o...",0,1,0,0,"[11, 13, 18, 219, 227, 230, 110, 82, 69, 122, ..."
1,R11963430-25840119-file0001.pdf,R11963430-25840119-file0001.pdf_2.png,2,3,\n\n,"[(14, 13, 14), (205, 215, 218), (103, 157, 198...",fe2e00fefffffffe,"(2200, 1700)","31 Coppel High School\nName: name, dtype: o...",0,1,0,0,"[14, 13, 14, 205, 215, 218, 103, 157, 198, 57,..."
2,R11963430-25840119-file0001.pdf,R11963430-25840119-file0001.pdf_3.png,3,3,\n\n,"[(21, 19, 20), (196, 212, 216), (127, 161, 184...",fe2e00debfbfffff,"(2200, 1700)","31 Coppel High School\nName: name, dtype: o...",0,1,0,0,"[21, 19, 20, 196, 212, 216, 127, 161, 184, 62,..."
3,0e0af021-9b60-4672-bb35-c8b7f4269bc2.pdf,0e0af021-9b60-4672-bb35-c8b7f4269bc2.pdf_1.png,1,1,,"[(14, 17, 25), (211, 211, 206), (92, 72, 66), ...",3f1b8387e7fcfcfc,"(1700, 2200)","3 Weber Technology High School\nName: name,...",0,0,0,1,"[14, 17, 25, 211, 211, 206, 92, 72, 66, 111, 1..."
4,0a8bbe6b-901b-4451-8cda-503f1b2a3c84.pdf,0a8bbe6b-901b-4451-8cda-503f1b2a3c84.pdf_1.png,1,2,\nAcademic Year: 2020-2021\nSubject\nFine Per...,"[(41, 41, 53), (233, 227, 209), (157, 195, 214...",0707ef8d898d8dbf,"(1700, 2200)",1 The Bronx High School of Science\nName: n...,0,0,1,2,"[41, 41, 53, 233, 227, 209, 157, 195, 214, 152..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,0fafd1ac-1f8e-4575-818b-6d72c9d347a6.pdf,0fafd1ac-1f8e-4575-818b-6d72c9d347a6.pdf_1.png,1,1,,"[(26, 25, 27), (211, 209, 200), (117, 169, 206...",8181fff1f3f1ffbf,"(1700, 2200)","4 St. Mary's High School\nName: name, dtype...",0,0,0,5,"[26, 25, 27, 211, 209, 200, 117, 169, 206, 76,..."
68,R11960756-25801269-file0001.pdf,R11960756-25801269-file0001.pdf_1.png,1,4,\n\nSTATE OF TEXAS\nACADEMIC ACHIEVEMENT REC...,"[(193, 208, 225), (61, 65, 100), (159, 79, 76)...",0000bfa524ffffff,"(1700, 2200)","27 ALLEN HIGH SCHOOL\nName: name, dtype: ob...",0,0,1,17,"[193, 208, 225, 61, 65, 100, 159, 79, 76, 157,..."
69,R11960756-25801269-file0001.pdf,R11960756-25801269-file0001.pdf_2.png,2,4,\n\nSTATE OF TEXAS\nACADEMIC ACHIEVEMENT REC...,"[(206, 211, 216), (197, 83, 67), (63, 62, 88),...",ffe0e0f0f0f78000,"(1700, 2200)","27 ALLEN HIGH SCHOOL\nName: name, dtype: ob...",0,0,1,17,"[206, 211, 216, 197, 83, 67, 63, 62, 88, 140, ..."
70,R11960756-25801269-file0001.pdf,R11960756-25801269-file0001.pdf_3.png,3,4,\n\nSTATE OF TEXAS\nACADEMIC ACHIEVEMENT REC...,"[(205, 204, 204), (27, 23, 27), (107, 104, 105...",2727f7070f0fffff,"(1700, 2200)","27 ALLEN HIGH SCHOOL\nName: name, dtype: ob...",1,0,1,17,"[205, 204, 204, 27, 23, 27, 107, 104, 105, 140..."


In [25]:
lists = []
for flat in df['flat']:
    lists.append(np.array(flat))

In [28]:
from sklearn.metrics.pairwise import cosine_similarity
similarity_matrix = cosine_similarity(lists)
query_index = 0
similar_indices = np.argsort(similarity_matrix[query_index])[::-1]  # Sort in descending order
print("Most similar documents to:", query_index)
for i in similar_indices[1:]:  # Exclude the document itself
    print(f"Document {i+1}: {i}")

Most similar documents to: 0
Document 44: 43
Document 10: 9
Document 8: 7
Document 22: 21
Document 40: 39
Document 4: 3
Document 43: 42
Document 23: 22
Document 65: 64
Document 63: 62
Document 27: 26
Document 26: 25
Document 54: 53
Document 55: 54
Document 25: 24
Document 15: 14
Document 18: 17
Document 67: 66
Document 58: 57
Document 6: 5
Document 24: 23
Document 17: 16
Document 37: 36
Document 41: 40
Document 28: 27
Document 29: 28
Document 33: 32
Document 64: 63
Document 21: 20
Document 61: 60
Document 19: 18
Document 39: 38
Document 51: 50
Document 3: 2
Document 68: 67
Document 36: 35
Document 14: 13
Document 56: 55
Document 2: 1
Document 16: 15
Document 7: 6
Document 5: 4
Document 66: 65
Document 13: 12
Document 12: 11
Document 70: 69
Document 46: 45
Document 42: 41
Document 69: 68
Document 59: 58
Document 31: 30
Document 52: 51
Document 60: 59
Document 32: 31
Document 53: 52
Document 30: 29
Document 50: 49
Document 71: 70
Document 49: 48
Document 20: 19
Document 57: 56
Document 4

In [29]:
def hex_string_to_vector(hex_string):
    return np.array([int(hex_string[i:i+2], 16) for i in range(0, len(hex_string), 2)])
df['hash_vector'] = df['hash'].apply(hex_string_to_vector)
hash_lists = []
for hv in df['hash_vector']:
    hash_lists.append(np.array(hv))
hash_similarity_matrix = cosine_similarity(hash_lists)
query_index = 0
hash_similar_indices = np.argsort(hash_similarity_matrix[query_index])[::-1]  # Sort in descending order
print("Most similar documents to:", query_index)
for i in hash_similar_indices[1:]:  # Exclude the document itself
    print(f"Document {i+1}: {i}")

Most similar documents to: 0
Document 44: 43
Document 9: 8
Document 34: 33
Document 46: 45
Document 36: 35
Document 11: 10
Document 10: 9
Document 3: 2
Document 2: 1
Document 8: 7
Document 45: 44
Document 35: 34
Document 23: 22
Document 38: 37
Document 37: 36
Document 62: 61
Document 41: 40
Document 63: 62
Document 43: 42
Document 55: 54
Document 56: 55
Document 4: 3
Document 20: 19
Document 24: 23
Document 42: 41
Document 48: 47
Document 16: 15
Document 25: 24
Document 68: 67
Document 27: 26
Document 14: 13
Document 7: 6
Document 72: 71
Document 65: 64
Document 18: 17
Document 32: 31
Document 53: 52
Document 60: 59
Document 39: 38
Document 17: 16
Document 69: 68
Document 67: 66
Document 51: 50
Document 66: 65
Document 29: 28
Document 61: 60
Document 26: 25
Document 47: 46
Document 58: 57
Document 33: 32
Document 57: 56
Document 6: 5
Document 28: 27
Document 40: 39
Document 70: 69
Document 22: 21
Document 12: 11
Document 13: 12
Document 50: 49
Document 52: 51
Document 5: 4
Document 54:

In [31]:
all_vectors = []
for index, row in df.iterrows():
    all_feat = []
    all_feat.extend(row['hash_vector'])
    all_feat.extend([row['is_gray'], row['is_landscape']])
    all_feat.extend(row['flat'])
    all_vectors.append(np.array(all_feat))

hash_similarity_matrix = cosine_similarity(all_vectors)
query_index = 0
hash_similar_indices = np.argsort(hash_similarity_matrix[query_index])[::-1]  # Sort in descending order
for i in hash_similar_indices[1:]:  # Exclude the document itself
    if i+1 < len(hash_similar_indices):
        print(f"Document {df.at[i+1, 'page_file']}: {df.at[i, 'page_file']}")

Document R11965041-25855700-file0001.pdf_2.png: R11965041-25855700-file0001.pdf_1.png
Document R11404893-11488889-file0001.pdf_4.png: R11404893-11488889-file0001.pdf_3.png
Document R11404893-11488889-file0001.pdf_2.png: R11404893-11488889-file0001.pdf_1.png
Document e34f4f36-35d7-4021-be41-27618f5f47f8.pdf_1.png: R11961143-25845812-file0001.pdf_3.png
Document 0e0af021-9b60-4672-bb35-c8b7f4269bc2.pdf_1.png: R11963430-25840119-file0001.pdf_3.png
Document R11963430-25840119-file0001.pdf_3.png: R11963430-25840119-file0001.pdf_2.png
Document 4a3ace61-62b2-44de-be88-090c81a74eb9.pdf_3.png: 4a3ace61-62b2-44de-be88-090c81a74eb9.pdf_2.png
Document R11965041-25855700-file0001.pdf_1.png: 34667a1d-8603-4a02-b3de-9e713c126937.pdf_1.png
Document 5afad5f9-744c-4153-8082-4ac133c87924.pdf_1.png: fcf57acf-8972-4611-9b28-082456653392.pdf_2.png
Document 0a8bbe6b-901b-4451-8cda-503f1b2a3c84.pdf_1.png: 0e0af021-9b60-4672-bb35-c8b7f4269bc2.pdf_1.png
Document 3fc3765e-30ec-455d-b652-3e9184f51668.pdf_1.png: 8a

In [33]:
from sklearn.cluster import KMeans

kmeans = KMeans(n_clusters=29, random_state=0, n_init="auto").fit(all_vectors)
kmeans.cluster_centers_

array([[ 1.40000000e+01,  2.95000000e+01,  1.57000000e+02,
         6.00000000e+01,  6.00000000e+01,  1.91000000e+02,
         6.70000000e+01,  1.91000000e+02,  5.00000000e-01,
         0.00000000e+00,  1.90000000e+01,  2.05000000e+01,
         2.55000000e+01,  2.13500000e+02,  2.13500000e+02,
         2.10000000e+02,  1.21000000e+02,  1.03500000e+02,
         9.60000000e+01,  1.29000000e+02,  1.43500000e+02,
         1.54500000e+02,  1.51500000e+02,  1.28000000e+02,
         1.03500000e+02],
       [ 1.58000000e+01,  1.62000000e+01,  2.21000000e+02,
         5.82000000e+01,  1.27000000e+02,  5.14000000e+01,
         2.55000000e+02,  2.55000000e+02,  8.00000000e-01,
        -2.77555756e-17,  2.09400000e+02,  2.08000000e+02,
         2.04400000e+02,  5.18000000e+01,  5.10000000e+01,
         5.42000000e+01,  1.14200000e+02,  1.21000000e+02,
         1.26400000e+02,  1.35400000e+02,  1.28800000e+02,
         1.24400000e+02,  1.12600000e+02,  1.22400000e+02,
         1.31600000e+02],
    

In [34]:
df['hs'][1]

'31    Coppel High School\nName: name, dtype: object'

In [62]:
labels = []
yy = []
def label_hs(hs):
    # clean, as got messed up 
    hmore = hs.split('\n')
    hs = hmore[0].strip()
    hs = hs.split()
    hs = ' '.join(hs[1:])
    if hs not in labels:
        labels.append(hs)
    y = labels.index(hs)
    yy.append(y)
    return y
df['label'] = df['hs'].apply(label_hs)
print(labels)
y = np.array(yy)
print(y)
df['label'].value_counts()

['Coppel High School', 'Weber Technology High School', 'The Bronx High School of Science', 'Lodi High School', 'PROSPER HIGH SCHOOL', "St. Mary's High School", 'Plano West Senior High', 'Aspire Langston Hughes Academy', 'East Side Union High School', 'East Union High School', 'Vandegrift High School', 'Westfield High School', 'NYC Department Of Education', 'Central Islip High School', 'The Urban Assembly NY Harbo School', 'Northport High School', 'Francis Lewis High School', 'ALLEN HIGH SCHOOL']
[ 0  0  0  1  2  2  3  0  0  0  0  4  4  5  6  6  6  6  6  1  7  1  1  1
  1  3  3  8  9  9  9  9  8 10 10 10  8 11 12 13  8  1  8  0  0  0  6  6
  6  6 12  9  9  7  7  5 14 14  9  9 12 15 15  7  7 16 16  5 17 17 17 17]


label
0     10
6      9
9      8
1      7
7      5
8      5
17     4
3      3
5      3
10     3
12     3
4      2
2      2
14     2
15     2
16     2
11     1
13     1
Name: count, dtype: int64

In [75]:
XX = []
def process_cols(row):
    x = list(row['hash_vector']) + [row['is_gray']] + [row['is_landscape']] + list(row['flat'])
    XX.extend(x)
    return np.array(x)
df['all'] = df.apply(lambda row: process_cols(row), axis=1)
df

Unnamed: 0,source,page_file,page,count,text,palette,hash,size,hs,is_gray,is_landscape,has_text,label,flat,hash_vector,all
0,R11963430-25840119-file0001.pdf,R11963430-25840119-file0001.pdf_1.png,1,3,\n\n,"[(11, 13, 18), (219, 227, 230), (110, 82, 69),...",fe2e00feb7ffffff,"(2200, 1700)","31 Coppel High School\nName: name, dtype: o...",0,1,0,0,"[11, 13, 18, 219, 227, 230, 110, 82, 69, 122, ...","[254, 46, 0, 254, 183, 255, 255, 255]","[254, 46, 0, 254, 183, 255, 255, 255, 0, 1, 11..."
1,R11963430-25840119-file0001.pdf,R11963430-25840119-file0001.pdf_2.png,2,3,\n\n,"[(14, 13, 14), (205, 215, 218), (103, 157, 198...",fe2e00fefffffffe,"(2200, 1700)","31 Coppel High School\nName: name, dtype: o...",0,1,0,0,"[14, 13, 14, 205, 215, 218, 103, 157, 198, 57,...","[254, 46, 0, 254, 255, 255, 255, 254]","[254, 46, 0, 254, 255, 255, 255, 254, 0, 1, 14..."
2,R11963430-25840119-file0001.pdf,R11963430-25840119-file0001.pdf_3.png,3,3,\n\n,"[(21, 19, 20), (196, 212, 216), (127, 161, 184...",fe2e00debfbfffff,"(2200, 1700)","31 Coppel High School\nName: name, dtype: o...",0,1,0,0,"[21, 19, 20, 196, 212, 216, 127, 161, 184, 62,...","[254, 46, 0, 222, 191, 191, 255, 255]","[254, 46, 0, 222, 191, 191, 255, 255, 0, 1, 21..."
3,0e0af021-9b60-4672-bb35-c8b7f4269bc2.pdf,0e0af021-9b60-4672-bb35-c8b7f4269bc2.pdf_1.png,1,1,,"[(14, 17, 25), (211, 211, 206), (92, 72, 66), ...",3f1b8387e7fcfcfc,"(1700, 2200)","3 Weber Technology High School\nName: name,...",0,0,0,1,"[14, 17, 25, 211, 211, 206, 92, 72, 66, 111, 1...","[63, 27, 131, 135, 231, 252, 252, 252]","[63, 27, 131, 135, 231, 252, 252, 252, 0, 0, 1..."
4,0a8bbe6b-901b-4451-8cda-503f1b2a3c84.pdf,0a8bbe6b-901b-4451-8cda-503f1b2a3c84.pdf_1.png,1,2,\nAcademic Year: 2020-2021\nSubject\nFine Per...,"[(41, 41, 53), (233, 227, 209), (157, 195, 214...",0707ef8d898d8dbf,"(1700, 2200)",1 The Bronx High School of Science\nName: n...,0,0,1,2,"[41, 41, 53, 233, 227, 209, 157, 195, 214, 152...","[7, 7, 239, 141, 137, 141, 141, 191]","[7, 7, 239, 141, 137, 141, 141, 191, 0, 0, 41,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67,0fafd1ac-1f8e-4575-818b-6d72c9d347a6.pdf,0fafd1ac-1f8e-4575-818b-6d72c9d347a6.pdf_1.png,1,1,,"[(26, 25, 27), (211, 209, 200), (117, 169, 206...",8181fff1f3f1ffbf,"(1700, 2200)","4 St. Mary's High School\nName: name, dtype...",0,0,0,5,"[26, 25, 27, 211, 209, 200, 117, 169, 206, 76,...","[129, 129, 255, 241, 243, 241, 255, 191]","[129, 129, 255, 241, 243, 241, 255, 191, 0, 0,..."
68,R11960756-25801269-file0001.pdf,R11960756-25801269-file0001.pdf_1.png,1,4,\n\nSTATE OF TEXAS\nACADEMIC ACHIEVEMENT REC...,"[(193, 208, 225), (61, 65, 100), (159, 79, 76)...",0000bfa524ffffff,"(1700, 2200)","27 ALLEN HIGH SCHOOL\nName: name, dtype: ob...",0,0,1,17,"[193, 208, 225, 61, 65, 100, 159, 79, 76, 157,...","[0, 0, 191, 165, 36, 255, 255, 255]","[0, 0, 191, 165, 36, 255, 255, 255, 0, 0, 193,..."
69,R11960756-25801269-file0001.pdf,R11960756-25801269-file0001.pdf_2.png,2,4,\n\nSTATE OF TEXAS\nACADEMIC ACHIEVEMENT REC...,"[(206, 211, 216), (197, 83, 67), (63, 62, 88),...",ffe0e0f0f0f78000,"(1700, 2200)","27 ALLEN HIGH SCHOOL\nName: name, dtype: ob...",0,0,1,17,"[206, 211, 216, 197, 83, 67, 63, 62, 88, 140, ...","[255, 224, 224, 240, 240, 247, 128, 0]","[255, 224, 224, 240, 240, 247, 128, 0, 0, 0, 2..."
70,R11960756-25801269-file0001.pdf,R11960756-25801269-file0001.pdf_3.png,3,4,\n\nSTATE OF TEXAS\nACADEMIC ACHIEVEMENT REC...,"[(205, 204, 204), (27, 23, 27), (107, 104, 105...",2727f7070f0fffff,"(1700, 2200)","27 ALLEN HIGH SCHOOL\nName: name, dtype: ob...",1,0,1,17,"[205, 204, 204, 27, 23, 27, 107, 104, 105, 140...","[39, 39, 247, 7, 15, 15, 255, 255]","[39, 39, 247, 7, 15, 15, 255, 255, 1, 0, 205, ..."


In [76]:
X = np.array(XX)
X = X.reshape((int(len(XX)/25), 25))
X

array([[254,  46,   0, ..., 184, 136,  86],
       [254,  46,   0, ..., 145,  87,  51],
       [254,  46,   0, ..., 167,  88,  35],
       ...,
       [255, 224, 224, ..., 130, 110, 110],
       [ 39,  39, 247, ..., 133, 132, 132],
       [ 33,  37, 255, ..., 125, 124, 124]], shape=(72, 25))

In [95]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
model = LogisticRegression(solver='lbfgs', max_iter=1000)
model.fit(X_train, y_train)

In [96]:
from sklearn.metrics import accuracy_score
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:02%}")

Accuracy: 55.555556%


In [97]:
from joblib import dump
dump(model, 'transcrip_LR.joblib')

['transcrip_LR.joblib']

## Performance

- 17 labels, random for each label would be 6% accuracy.
- If we just guessed the high school with the highest count of 10 of the 72 examples that would be 14% accuracy.
- So 56% is actually pretty good, given the very small sample set

## Inference

All above pre-processing steps, and creation of features needs to be done for a new image.

Then the model can be loaded, and the school inferred.

In [99]:
coppel = XX[:25]
prob = model.predict_proba(np.array([coppel]))
print(prob)

[[9.99289141e-01 1.25494016e-04 2.24425761e-11 5.30037215e-14
  1.99181876e-10 5.90366699e-08 7.53757063e-18 3.22323491e-07
  7.83877741e-20 5.84982304e-04 1.54829505e-13 4.72274021e-11
  4.50658129e-13 5.71020276e-11 5.18505539e-10 4.25079749e-15
  3.46369821e-14]]


In [105]:
max_value = max(prob[0])
idx = list(prob[0]).index(max_value)
print(f'{labels[idx]}: {max_value:02%}')

Coppel High School: 99.928914%


In [106]:
with open('labels.txt', "w") as file:
    for item in labels:
        file.write(item + "\n")
