Skip to content

Commit

Permalink
Merge pull request #5 from tarundhankhar/tarun/aadhaar
Browse files Browse the repository at this point in the history
Tarun/aadhaar
  • Loading branch information
SourabhJaz committed Apr 5, 2022
2 parents a7d3d9c + ff78840 commit 1cc9b90
Show file tree
Hide file tree
Showing 4 changed files with 228 additions and 31 deletions.
107 changes: 107 additions & 0 deletions src/_tests_/aadhaar_parser.mock.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
/**
 * Template for an expected `document_details` object with every field left
 * `undefined`. The PARSED_DETAILS fixtures in this file spread it and then
 * override only the fields relevant to their scenario.
 */
const DOCUMENT_DETAILS_BASE = {
  date_of_birth: undefined,
  fathers_name: undefined,
  gender: undefined,
  identification_number: undefined,
  name: undefined,
  document_type: undefined,
  address: undefined
};

/**
 * Fixtures for the Aadhaar address-parser tests.
 *
 * RAW_TEXTS holds the OCR line arrays passed to the parser as `raw_text`;
 * PARSED_DETAILS holds the `document_details` each test expects for the
 * fixture with the same key.
 */
export default {
  RAW_TEXTS: {
    // Lines mixing English with non-English (Devanagari and Tamil) text.
    NON_ENGLISH: [
      "भारतीय विशिष्ट पहचान प्राधिकरण",
      "ARDNAAR",
      "GUIGUEL SUATAN SUDHORITY OF INDIA",
      "Address:",
      "முகவரி:",
      "3/1, M G R.",
      "3/1, எம்.ஜி. NAGAR. MANALURPET,",
      "ஆர் நகர், மணலூர்பேட்டை Manafurpet, Viluppuram,",
      "மணலூர்பேட்டை,",
      "விழுப்புரம்,",
      "தமிழ் நாடு - 4",
      "TamilNadu-605754",
    ],
    // More than one line ends with a six-digit PIN code ("422101", then "580001").
    FIRST_OCCURING_PIN_CODE: [
      "UNIQUE IDENTIFICATION AUTHORITY OF INDIA",
      "Address: Pawar Vadi Panchak,",
      "Relavay Lain Jawal, Jail Road.",
      "Nashik Road, Nashik Road,",
      "Nashik, Maharashtra, 422101",
      "Bengaluru-580001",
      "आधार",
      "पता पवार वाडी पंचक, रेलवे लाईन",
      "जवळ, जेल रोड, नाशिक रोड, नाशिक",
      "रोड, नाशिक, महाराष्ट्र, 422101",
    ],
    // The address header is immediately followed by a guardian marker ("S/O").
    GUARDIAN_NAME_ADDRESS_HEADER: [
      "Unique Identification Authority of India",
      "Address: S/O Subhash, B-260, SECTOR-3,",
      "PHASE-3, DWARKA, South West Delhi,",
      "Delhi, 110078",
    ],
    // A stray "-" sits between the address header and the first token.
    UNWANTED_PREFIX_SUFFIX: [
      "Unique Identification Authority of India",
      "Address: -B-260, SECTOR-3,",
      "PHASE-3, DWARKA, South West Delhi,",
      "Delhi, 110078",
    ],
    // The last line carries only three digits, so no line ends with a six-digit PIN code.
    UNDEFINED_ADDRESS: [
      "Unique Identification Authority of India",
      "Address: B-260, SECTOR-3,",
      "PHASE-3, DWARKA, South West Delhi,",
      "Delhi, 110",
    ],
    // The PIN code is followed by a period and a trailing space.
    ADDRESS_END_LINE: [
      "Unique Identification Authority of India",
      "Address: B-260, SECTOR-3,",
      "PHASE-3, DWARKA, South West Delhi,",
      "Delhi-110078. ",
    ],
    // The address header is the very first line of the raw text.
    ADDRESS_START_LINE: [
      "Address: B-260, SECTOR-3,",
      "PHASE-3, DWARKA, South West Delhi,",
      "Delhi, 110078",
    ],
  },
  PARSED_DETAILS: {
    NON_ENGLISH: {
      ...DOCUMENT_DETAILS_BASE,
      document_type: "AADHAAR_CARD",
      address: "3/1, M G NAGAR. MANALURPET, Manafurpet, Viluppuram, TamilNadu-605754",
    },
    FIRST_OCCURING_PIN_CODE: {
      ...DOCUMENT_DETAILS_BASE,
      document_type: "AADHAAR_CARD",
      address: "Pawar Vadi Panchak Relavay Lain Jawal, Jail Road. Nashik Road, Nashik Road, Nashik, Maharashtra, 422101",
      fathers_name: "Pawar Vadi Panchak",
    },
    GUARDIAN_NAME_ADDRESS_HEADER: {
      ...DOCUMENT_DETAILS_BASE,
      document_type: "AADHAAR_CARD",
      address: "S/O Subhash B-260 SECTOR-3 PHASE-3, DWARKA, South West Delhi, Delhi, 110078",
      fathers_name: "Subhash",
    },
    UNWANTED_PREFIX_SUFFIX: {
      ...DOCUMENT_DETAILS_BASE,
      document_type: "AADHAAR_CARD",
      address: "B-260 SECTOR-3 PHASE-3, DWARKA, South West Delhi, Delhi, 110078",
    },
    // No address override: the expected address stays `undefined`.
    UNDEFINED_ADDRESS: {
      ...DOCUMENT_DETAILS_BASE,
      document_type: "AADHAAR_CARD",
    },
    ADDRESS_END_LINE: {
      ...DOCUMENT_DETAILS_BASE,
      document_type: "AADHAAR_CARD",
      address: "B-260 SECTOR-3 PHASE-3, DWARKA, South West Delhi, Delhi-110078",
    },
    ADDRESS_START_LINE: {
      ...DOCUMENT_DETAILS_BASE,
      document_type: "AADHAAR_CARD",
      address: "B-260 SECTOR-3 PHASE-3, DWARKA, South West Delhi, Delhi, 110078",
    },
  },
};
66 changes: 66 additions & 0 deletions src/_tests_/aadhaar_parser.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import AadhaarParser from "../document-parser/aadhaar-parser";
import AADHAAR_PARSER_MOCKS from "./aadhaar_parser.mock";

/** Key shared by the RAW_TEXTS and PARSED_DETAILS fixture maps. */
type FixtureKey = keyof typeof AADHAAR_PARSER_MOCKS.RAW_TEXTS;

/**
 * One entry per scenario: [test title, fixture key]. Every scenario feeds the
 * raw OCR lines for its key to the parser and expects a valid document whose
 * details match the parsed fixture stored under the same key.
 */
const ADDRESS_PARSER_CASES: Array<[string, FixtureKey]> = [
  [
    'Address parser should remove all the non-english text segments',
    'NON_ENGLISH'
  ],
  [
    'Address parser should mark address end at the first occurance of pin code',
    'FIRST_OCCURING_PIN_CODE'
  ],
  [
    'Address parser should keep the address header starting with S/O, C/O, D/O etc.',
    'GUARDIAN_NAME_ADDRESS_HEADER'
  ],
  [
    'Address parser should remove the unwanted prefix or suffix noise from the address',
    'UNWANTED_PREFIX_SUFFIX'
  ],
  [
    'Address parser should return undefined address if the end line or start line of address is not identified',
    'UNDEFINED_ADDRESS'
  ],
  [
    'Address parser should identify the end line of address by pin code, even if it is followed by some unwanted characters',
    'ADDRESS_END_LINE'
  ],
  [
    'Address parser should identify the start line of address, even if it is first line of the raw text',
    'ADDRESS_START_LINE'
  ]
];

// Register the scenarios in table order so each remains an individually named test.
ADDRESS_PARSER_CASES.forEach(([title, fixtureKey]) => {
  test(title, () => {
    const actual = AadhaarParser.parseDocumentDetails({
      raw_text: AADHAAR_PARSER_MOCKS.RAW_TEXTS[fixtureKey]
    });
    const expected = {
      is_document_valid: true,
      document_details: AADHAAR_PARSER_MOCKS.PARSED_DETAILS[fixtureKey]
    };
    expect(actual).toMatchObject(expected);
  });
});
78 changes: 52 additions & 26 deletions src/document-parser/aadhaar-parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,15 @@ const AADHAAR_REGEX = {
document: /Aadhaar|AADHAAR/,
number_format: /[\d\s]{12,}/,
name_format: /^[a-zA-Z\s\.]+$/,
address_start: /([Ss]\/[Oo])|([Ww]\/[Oo])|([Dd]\/[Oo])|([Cc]\/[Oo])|(Address|ADDRESS)/,
address_start: /([Ss]\/[Oo])|([Ww]\/[Oo])|([Dd]\/[Oo])|([Cc]\/[Oo])|(Address[:\s]*)|(ADDRESS[:\s]*)/,
address_head: /Address[:\s]*|ADDRESS[:\s]*/,
address_start_split: /,/,
noise: /(^[\s]+$)|(^[A-Z]{0,2}[.,]+$)|(^[a-z])|(^[A-Z0-9]{2,}[a-z]+)|(^[A-Z0-9]+[a-z]+[A-Z]+)|(^[A-Z0-9]+[.,]+[A-Z0-9]+)|(^[0-9]+[a-zA-Z]{2,})/,
address_end: /([A-Z\s]+[a-z]*[,-\s]+[0-9]{6}$)|(^[0-9]{6}$)/,
fathers_name_split: /([Ss]\/[Oo])[\s:]+|([Dd]\/[Oo])[\s:]+|([Cc]\/[Oo])[\s:]+|([Ww]\/[Oo])[\s:]+/
noise: /(^[\s]+$)|(^[A-Z]{0,2}[.,]+$)|(^[A-Z0-9]{2,}[a-z]+)|(^[A-Z0-9]+[.,]+[A-Z0-9]+)|(^[0-9]+[a-zA-Z]{3,})|(^www\.\w+)|(\.gov\.in*$)/,
address_end: /([A-Z\s]+[a-z]*[,;:._\s-]+[0-9]{6}\.?$)|(^[0-9]{6}\.?$)/,
fathers_name_split: /([Ss]\/[Oo])[\s:]+|([Dd]\/[Oo])[\s:]+|([Cc]\/[Oo])[\s:]+|([Ww]\/[Oo])[\s:]+/,
english: /(^[\w,.:;&*\/|)('"#+^`-]*$)/,
local_language_reverse_prefix: /[^\w\s,.:;&*\/|)('"#+^`-]+.*/,
unwanted_prefix_suffix: /(^[\s.,-]+)|([\s.,-]+$)/g
};

const LINE_MIN_SIZE = 4;
Expand All @@ -44,6 +48,9 @@ const filterNoiseFromLine = (lineText: string) => {
if (AADHAAR_REGEX["noise"].exec(word)) {
return false;
}
if (!AADHAAR_REGEX["english"].exec(word)) {
return false;
}
return true;
});
return _.join(filteredSpacedList, " ");
Expand All @@ -52,8 +59,22 @@ const filterNoiseFromLine = (lineText: string) => {
const removeNoiseFromText = (lines: Array<string>) => {
const filteredLines = [];
_.forEach(lines, line => {
if (_.size(line) >= LINE_MIN_SIZE) {
const filteredText = filterNoiseFromLine(line);
const reverseLine = line
.split("")
.reverse()
.join("");
const reverseLineWithoutLocalLanguage = _.replace(
reverseLine,
AADHAAR_REGEX["local_language_reverse_prefix"],
""
);
const lineWithoutLocalLanguage = reverseLineWithoutLocalLanguage
.split("")
.reverse()
.join("")
.trim();
if (_.size(lineWithoutLocalLanguage) >= LINE_MIN_SIZE) {
const filteredText = filterNoiseFromLine(lineWithoutLocalLanguage);
filteredLines.push(filteredText);
}
});
Expand Down Expand Up @@ -82,29 +103,33 @@ const parseAadhaarHeadingLineNumbers = (lines: Array<string>) => {
} else if (AADHAAR_REGEX["govt"].exec(line)) {
aadhaarHeadingLineNumbers["aadhar_govt_text_line"] = index;
} else if (
!aadhaarHeadingLineNumbers["aadhar_dob_text_line"] &&
aadhaarHeadingLineNumbers["aadhar_dob_text_line"] === undefined &&
AADHAAR_REGEX["dob_heading"].exec(line)
) {
aadhaarHeadingLineNumbers["aadhar_dob_text_line"] = index;
} else if (
!aadhaarHeadingLineNumbers["aadhar_gender_text_line"] &&
aadhaarHeadingLineNumbers["aadhar_gender_text_line"] === undefined &&
AADHAAR_REGEX["gender"].exec(line)
) {
aadhaarHeadingLineNumbers["aadhar_gender_text_line"] = index;
} else if (
!aadhaarHeadingLineNumbers["aadhar_number_text_line"] &&
aadhaarHeadingLineNumbers["aadhar_number_text_line"] === undefined &&
AADHAAR_REGEX["number_format"].exec(line)
) {
aadhaarHeadingLineNumbers["aadhar_number_text_line"] = index;
} else if (
!aadhaarHeadingLineNumbers["aadhar_relative_name_text_line"] &&
aadhaarHeadingLineNumbers["aadhar_relative_name_text_line"] === undefined &&
AADHAAR_REGEX["relative_name_heading"].exec(line)
) {
aadhaarHeadingLineNumbers["aadhar_relative_name_text_line"] = index;
} else if (AADHAAR_REGEX["address_start"].exec(line)) {
} else if (
aadhaarHeadingLineNumbers["aadhar_address_start_line"] === undefined &&
AADHAAR_REGEX["address_start"].exec(line)
) {
aadhaarHeadingLineNumbers["aadhar_address_start_line"] = index;
} else if (
aadhaarHeadingLineNumbers["aadhar_address_start_line"] &&
aadhaarHeadingLineNumbers["aadhar_address_start_line"] !== undefined &&
aadhaarHeadingLineNumbers["aadhar_address_end_line"] === undefined &&
AADHAAR_REGEX["address_end"].exec(line)
) {
aadhaarHeadingLineNumbers["aadhar_address_end_line"] = index;
Expand Down Expand Up @@ -211,9 +236,9 @@ const parseAadhaarNumber = (
const getAddressStartLineTokens = (rawAddressStartText: string) => {
const addressRelevantTokens = _.split(
rawAddressStartText,
AADHAAR_REGEX["address_start"]
AADHAAR_REGEX["address_head"]
);
const addressRelevantString = _.join(_.slice(addressRelevantTokens, 1), "");
const addressRelevantString = _.join(addressRelevantTokens, "");
const addressSplit = _.split(
addressRelevantString,
AADHAAR_REGEX["address_start_split"]
Expand Down Expand Up @@ -292,20 +317,17 @@ const processAadhaarAddress = (
const addressStartSplit = getAddressStartLineTokens(
textLines[addressStartLine]
);
if (_.size(addressStartSplit) > 1) {
const relevantTokens = _.slice(addressStartSplit, 1);
_.forEach(relevantTokens, token => {
_.forEach(addressStartSplit, token => {
if (!_.isEmpty(token)) {
addressLines.push(token);
});
}
}
});
_.forEach(_.range(addressStartLine + 1, addressEndLine), lineNumber => {
addressLines.push(textLines[lineNumber]);
});
const addressEndRelevantText = getAddressEndLineText(
textLines[addressEndLine]
);
addressLines.push(addressEndRelevantText);
return _.join(addressLines, " ");
addressLines.push(textLines[addressEndLine]);
const address = _.join(addressLines, " ");
return _.replace(address, AADHAAR_REGEX["unwanted_prefix_suffix"], "");
};

const parseAadhaarText = (
Expand Down Expand Up @@ -361,12 +383,16 @@ const validateAadhaarText = (
const {
aadhar_number_text_line: aadharNumberTextLine,
aadhar_title_text_line: aadharTitleTextLine,
aadhar_document_text_line: aadharDocumentTextLine
aadhar_document_text_line: aadharDocumentTextLine,
aadhar_address_start_line: aadharAddressStartLine,
aadhar_address_end_line: aadharAddressEndLine
} = aadhaarHeadingLineNumbers;
return (
_.isNumber(aadharNumberTextLine) ||
_.isNumber(aadharTitleTextLine) ||
_.isNumber(aadharDocumentTextLine)
_.isNumber(aadharDocumentTextLine) ||
_.isNumber(aadharAddressStartLine) ||
_.isNumber(aadharAddressEndLine)
);
};

Expand Down
8 changes: 3 additions & 5 deletions src/ocr/google-vision/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,10 @@ Constants.REQUEST_PAYLOAD = {
},
features: [
{
type: "TEXT_DETECTION"
model: "builtin/latest",
type: "DOCUMENT_TEXT_DETECTION"
}
],
imageContext: {
languageHints: ["en"]
}
]
}
]
};
Expand Down

0 comments on commit 1cc9b90

Please sign in to comment.