#### Importing all the modules

In [57]:
import cv2
import pytesseract
import os
import pandas as pd
import re
# for natural language processing: named entity recognition
import spacy
from collections import Counter
import en_core_web_sm

#### Set a path for tesseract executable

In [2]:
pytesseract.pytesseract.tesseract_cmd=r'C:/Program Files/Tesseract-OCR/tesseract.exe'

#### Image read

In [9]:
img = cv2.imread("images/in.jpg")

In [10]:
img

array([[[119,  88,  95],
        [118,  87,  94],
        [117,  86,  93],
        ...,
        [129,  89,  71],
        [130,  90,  72],
        [130,  90,  72]],

       [[121,  90,  97],
        [120,  89,  96],
        [119,  88,  95],
        ...,
        [129,  89,  71],
        [129,  89,  71],
        [129,  89,  71]],

       [[123,  92,  99],
        [123,  92,  99],
        [121,  90,  97],
        ...,
        [128,  88,  70],
        [128,  88,  70],
        [129,  89,  71]],

       ...,

       [[160, 149, 159],
        [160, 149, 159],
        [160, 149, 159],
        ...,
        [106,  86,  81],
        [104,  84,  79],
        [103,  83,  78]],

       [[163, 152, 162],
        [163, 152, 162],
        [162, 151, 161],
        ...,
        [104,  84,  79],
        [102,  82,  77],
        [101,  81,  76]],

       [[165, 154, 164],
        [164, 153, 163],
        [164, 153, 163],
        ...,
        [103,  83,  78],
        [101,  81,  76],
        [100,  80,  75]]

In [14]:
text = pytesseract.image_to_string(img)
text

''

In [15]:
print(text)




#### Image to text

In [17]:
img = cv2.imread("images/in1.jpg")
img = cv2.resize(img, (400, 450))
cv2.imshow("Image", img)
text = pytesseract.image_to_string(img)
print(text)
cv2.waitKey(0)
cv2.destroyAllWindows()

“Not in doing what
you like, but in liking
what you do is the
secret of happiness.”
—J.M. BARRIE



In [21]:
import os
folderPath = "images"
myRevList = os.listdir(folderPath)

In [22]:
for image in  myRevList:
    img = cv2.imread(f'{folderPath}/{image}')
    cv2.imshow("Image", img)
    cv2.waitKey(0)
    cv2.destroyAllWindows()

In [23]:
corpus = []
for images in myRevList:
    img = cv2.imread(f'{folderPath}/{images}')
    if img is None:
        corpus.append("Could not read the image.")
    else:
        rev = pytesseract.image_to_string(img)
        corpus.append(rev)
list(corpus)
corpus

['',
 '“Not in doing what\nyou like, but in liking\nwhat you do is the\nsecret of happiness.”\n—].M. BARRIE\n',
 'The cheat sheet\n\nPssst... Just like Sophie, involve\nStories of wolves and songs about\nrobots to explain a grammar lesson...\nLet your pupils express themselves\nand make learning inside the\nclassroom even more fun!\n\n',
 "I'm sterile. | didn’t say it to my first girlfriend\nfor a long time because | was afraid she would\nleave me. But when the truth was revealed,\nshe actually did. Recently, | met a girl,\nfell in love with her like crazy, and kept my\nproblem silent. But yesterday | told her\neverything. She looked at me and said,\n",
 '"Storytelling is a contextual\nbridge between play and\n\nwritten narrative.”\n\n(Bruner, 1986, 1990; Saracho and Spadek, 2006)\n\n',
 '“Ava uses Special Stories to create a story\nof her day which we look at when she gets\nhome. It’s a wonderful way to explore the\nworld and encourage children to open up\nabout the way they interact 

In [25]:

data = pd.DataFrame(list(corpus), columns=['Review'])
data

Unnamed: 0,Review
0,
1,"“Not in doing what\nyou like, but in liking\nw..."
2,"The cheat sheet\n\nPssst... Just like Sophie, ..."
3,I'm sterile. | didn’t say it to my first girlf...
4,"""Storytelling is a contextual\nbridge between ..."
5,“Ava uses Special Stories to create a story\no...
6,Characterize the camera’s\nability to read int...


In [26]:
#removing special characters

def clean(text):
    return re.sub('[^A-Za-z0-9" "]+', ' ', text)
data['Cleaned Review'] = data['Review'].apply(clean)
data

Unnamed: 0,Review,Cleaned Review
0,,
1,"“Not in doing what\nyou like, but in liking\nw...",Not in doing what you like but in liking wha...
2,"The cheat sheet\n\nPssst... Just like Sophie, ...",The cheat sheet Pssst Just like Sophie invol...
3,I'm sterile. | didn’t say it to my first girlf...,I m sterile didn t say it to my first girlf...
4,"""Storytelling is a contextual\nbridge between ...","""Storytelling is a contextual bridge between p..."
5,“Ava uses Special Stories to create a story\no...,Ava uses Special Stories to create a story of...
6,Characterize the camera’s\nability to read int...,Characterize the camera s ability to read inte...


In [28]:
data['Cleaned Review']

0                                                     
1     Not in doing what you like  but in liking wha...
2    The cheat sheet Pssst  Just like Sophie  invol...
3    I m sterile    didn t say it to my first girlf...
4    "Storytelling is a contextual bridge between p...
5     Ava uses Special Stories to create a story of...
6    Characterize the camera s ability to read inte...
Name: Cleaned Review, dtype: object

In [45]:
# for manipulating dataframes
import pandas as pd

# for visualizations
#%matplotlib inline

#### NER

In [51]:
tokens = nlp(''.join(str(data['Cleaned Review'].tolist())))
tokens

['', ' Not in doing what you like  but in liking what you do is the secret of happiness M  BARRIE ', 'The cheat sheet Pssst  Just like Sophie  involve Stories of wolves and songs about robots to explain a grammar lesson Let your pupils express themselves and make learning inside the classroom even more fun ', 'I m sterile    didn t say it to my first girlfriend for a long time because   was afraid she would leave me  But when the truth was revealed she actually did  Recently    met a girl fell in love with her like crazy  and kept my problem silent  But yesterday   told her everything  She looked at me and said ', '"Storytelling is a contextual bridge between play and written narrative Bruner  1986  1990  Saracho and Spadek  2006 ', ' Ava uses Special Stories to create a story of her day which we look at when she gets home  It s a wonderful way to explore the world and encourage children to open up about the way they interact with the environment around them Denise Wilson Bainbridge  S

In [52]:
items = [x.text for x in tokens.ents]
Counter(items).most_common(20)

[('Sophie', 1),
 ('first', 1),
 ('yesterday', 1),
 ('Storytelling', 1),
 ('Bruner', 1),
 ('1986  1990', 1),
 ('Saracho', 1),
 ('Spadek', 1),
 ('2006', 1),
 ('Denise Wilson Bainbridge', 1)]

In [53]:
person_list = []
for ent in tokens.ents:
    if ent.label_ == 'PERSON':
        person_list.append(ent.text)
        
person_counts = Counter(person_list).most_common(20)
df_person = pd.DataFrame(person_counts, columns =['text', 'count'])

In [54]:
df_person

Unnamed: 0,text,count
0,Denise Wilson Bainbridge,1
