Skip to content

Commit

Permalink
fix(ZNTA-2247): Remove magic numbers for glossary csv columns (#592)
Browse files Browse the repository at this point in the history
* fix(ZNTA-2247): Remove magic numbers for glossary csv columns
Still expects column 0 to be the source locale, but checks that the
PoS and Description columns are named. This prevents garbage in
said fields.
This change also prevents repeatedly setting these fields, only
doing so once per row.
* fix(ZNTA-2247): Allow several synonyms for data fields
Fix invalid test, contradicting the specified behaviour.
* fix(ZNTA-2247): Add docs and improve code as per review
  • Loading branch information
djansen-redhat committed Jan 11, 2018
1 parent 33802a3 commit 89a9521
Show file tree
Hide file tree
Showing 5 changed files with 78 additions and 45 deletions.
Expand Up @@ -22,6 +22,7 @@

import java.io.IOException;
import java.io.Reader;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
Expand All @@ -45,9 +46,15 @@
*
**/
public class GlossaryCSVReader extends AbstractGlossaryPushReader {

private final LocaleId srcLang;
private final static String POS = "POS";
private final static String[] POSSYNONYMS =
{"POS", "PARTOFSPEECH", "PART OF SPEECH"};
private final static String DESC = "DESCRIPTION";
private final static String[] DESCSYNONYMS =
{"DESCRIPTION", "DESC", "DEFINITION"};
private final static int TERM = 0;

/**
* This class will close the reader
Expand All @@ -68,7 +75,7 @@ public Map<LocaleId, List<GlossaryEntry>> extractGlossary(Reader reader,
Map<Integer, LocaleId> localeColMap =
setupLocalesMap(records, descriptionMap);

LocaleId srcLocale = localeColMap.get(0);
LocaleId srcLocale = localeColMap.get(TERM);

if (!srcLang.equals(srcLocale)) {
throw new RuntimeException("input source language '" + srcLang
Expand All @@ -77,20 +84,23 @@ public Map<LocaleId, List<GlossaryEntry>> extractGlossary(Reader reader,
}
Map<LocaleId, List<GlossaryEntry>> results = Maps.newHashMap();

for (int i = 1; i < records.size(); i++) {
CSVRecord row = records.get(i);
for (int x = 1; x < row.size()
&& localeColMap.containsKey(x); x++) {

GlossaryEntry entry = new GlossaryEntry();
entry.setSrcLang(srcLocale);
for (CSVRecord row : records.subList(1, records.size())) {
GlossaryEntry entry = new GlossaryEntry();
entry.setSrcLang(srcLocale);
if (descriptionMap.containsKey(POS)) {
entry.setPos(row.get(descriptionMap.get(POS)));
}
if (descriptionMap.containsKey(DESC)) {
entry.setDescription(row.get(descriptionMap.get(DESC)));
entry.setQualifiedName(new QualifiedName(qualifiedName));
}
entry.setQualifiedName(new QualifiedName(qualifiedName));

for (int x = 1; x < row.size()
&& localeColMap.containsKey(x); x++) {

GlossaryTerm srcTerm = new GlossaryTerm();
srcTerm.setLocale(srcLocale);
srcTerm.setContent(row.get(0));
srcTerm.setContent(row.get(TERM));

entry.getGlossaryTerms().add(srcTerm);

Expand All @@ -100,8 +110,8 @@ public Map<LocaleId, List<GlossaryEntry>> extractGlossary(Reader reader,
GlossaryTerm transTerm = new GlossaryTerm();
transTerm.setLocale(transLocaleId);
transTerm.setContent(transContent);

entry.getGlossaryTerms().add(transTerm);

List<GlossaryEntry> list = results.get(transLocaleId);
if (list == null) {
list = Lists.newArrayList();
Expand All @@ -117,8 +127,9 @@ public Map<LocaleId, List<GlossaryEntry>> extractGlossary(Reader reader,
}

/**
* Basic validation of CVS file format - At least 2 rows in the CVS file -
* Empty content validation - All row must have the same column count
* Basic validation of CSV file format - At least 2 rows in the CSV file -
* Empty content validation - All rows must have the same column count.
* @param records list of records representing a csv file
*/
private void validateCSVEntries(@Nonnull List<CSVRecord> records) {
if (records.isEmpty()) {
Expand All @@ -138,33 +149,46 @@ private void validateCSVEntries(@Nonnull List<CSVRecord> records) {
}

/**
* Parser reads from all from first row and exclude column from description
* map. Format of CVS: {source locale},{locale},{locale}...,pos,description
* Parser reads from all from first row and up to first recognised
* non-locale column mapping.
* Format of CSV: {source locale},{locale},{locale}...,pos,description
* @param records list of records representing a csv file
* @param descriptionMap header locations of non-locale columns
*/
private Map<Integer, LocaleId> setupLocalesMap(List<CSVRecord> records,
Map<String, Integer> descriptionMap) {
Map<Integer, LocaleId> localeColMap = new HashMap<Integer, LocaleId>();
Map<Integer, LocaleId> localeColMap = new HashMap<>();
CSVRecord headerRow = records.get(0);
for (int row = 0; row <= headerRow.size()
&& !descriptionMap.containsValue(row); row++) {
for (int column = 0; column < headerRow.size()
&& !descriptionMap.containsValue(column); column++) {
LocaleId locale =
new LocaleId(StringUtils.trim(headerRow.get(row)));
localeColMap.put(row, locale);
new LocaleId(StringUtils.trim(headerRow.get(column)));
localeColMap.put(column, locale);
}
return localeColMap;
}

/**
* Read last 2 columns in CSV:
* {source locale},{locale},{locale}...,pos,description
* Locate trailing data columns in CSV:
* {source locale},{locale},{locale}...,pos,description,...
*
* @param records
* @param records list of records representing a csv file
*/
private Map<String, Integer> setupDescMap(List<CSVRecord> records) {
Map<String, Integer> descMap = new HashMap<String, Integer>();
Map<String, Integer> descMap = new HashMap<>();
CSVRecord headerRow = records.get(0);
descMap.put(POS, headerRow.size() - 2);
descMap.put(DESC, headerRow.size() - 1);
for (int position = 1; position < headerRow.size(); position++) {
String headerItem = headerRow.get(position);
if (headerItem.trim().length() == 0) {
continue;
}
if (Arrays.asList(POSSYNONYMS).contains(headerItem.toUpperCase())) {
descMap.put(POS, position);
} else if (Arrays.asList(DESCSYNONYMS)
.contains(headerItem.toUpperCase())) {
descMap.put(DESC, position);
}
}
return descMap;
}
}
Expand Up @@ -57,20 +57,18 @@ public void testNotMatchingSource() throws IOException {
}

@Test
public void extractGlossaryTest1() throws IOException {

public void extractGlossary() throws IOException {
GlossaryCSVReader reader = new GlossaryCSVReader(LocaleId.EN_US);
int entryPerLocale = 2;

File sourceFile =
new File("src/test/resources/glossary/translate1.csv");

Reader inputStreamReader =
new InputStreamReader(new FileInputStream(sourceFile), "UTF-8");
BufferedReader br = new BufferedReader(inputStreamReader);
BufferedReader bufferedReader = new BufferedReader(
new InputStreamReader(new FileInputStream(sourceFile), "UTF-8"));

Map<LocaleId, List<GlossaryEntry>> glossaries =
reader.extractGlossary(br,
reader.extractGlossary(bufferedReader,
GlossaryResource.GLOBAL_QUALIFIED_NAME);

assertThat(glossaries.keySet()).hasSize(2);
Expand All @@ -79,29 +77,40 @@ public void extractGlossaryTest1() throws IOException {
for (Map.Entry<LocaleId, List<GlossaryEntry>> entry : glossaries
.entrySet()) {
assertThat(entry.getValue()).hasSize(entryPerLocale);
assertEntry(entry);
}
}

@Test
public void extractGlossaryTest2() throws IOException {
public void extractGlossaryWithOtherHeaders() throws IOException {
GlossaryCSVReader reader = new GlossaryCSVReader(LocaleId.EN_US);
int entryPerLocale = 2;

File sourceFile =
new File("src/test/resources/glossary/translate2.csv");

Reader inputStreamReader =
new InputStreamReader(new FileInputStream(sourceFile), "UTF-8");
BufferedReader br = new BufferedReader(inputStreamReader);
BufferedReader bufferedReader = new BufferedReader(
new InputStreamReader(new FileInputStream(sourceFile), "UTF-8"));

Map<LocaleId, List<GlossaryEntry>> glossaries =
reader.extractGlossary(br, GlossaryResource.GLOBAL_QUALIFIED_NAME);
Map<LocaleId, List<GlossaryEntry>> glossaries = reader.extractGlossary(
bufferedReader, GlossaryResource.GLOBAL_QUALIFIED_NAME);

assertThat(glossaries.keySet()).hasSize(2);
assertThat(glossaries.keySet()).contains(LocaleId.ES, new LocaleId("zh"));

for (Map.Entry<LocaleId, List<GlossaryEntry>> entry : glossaries
.entrySet()) {
assertThat(entry.getValue()).hasSize(entryPerLocale);
assertEntry(entry);
}
}

private void assertEntry(Map.Entry<LocaleId, List<GlossaryEntry>> entry) {
assertThat(entry.getValue().get(0).getDescription()
.startsWith("testing of hello"));
assertThat(entry.getValue().get(1).getDescription()
.startsWith("testing of morning"));
assertThat(entry.getValue().get(0).getPos()).isEqualTo("verb");
assertThat(entry.getValue().get(1).getPos()).isEqualTo("noun");
}
}
@@ -1,3 +1,3 @@
"en-US","es","zh","pos","description"
"Hello","Hola","您好","noun","testing of hello csv"
"Hello","Hola","您好","verb","testing of hello csv"
"Morning","mañana","上午","noun","testing of morning csv"
@@ -1,3 +1,3 @@
"en-US","es","zh","description1","description2"
"Hello","Hola","您好","testing of hello csv1","testing of hello csv2"
"Morning","mañana","上午","testing of morning csv1","testing of morning csv2"
"en-US","es","zh","definition","synonyms","partofspeech"
"Hello","Hola","您好","testing of hello csv","Hi","verb"
"Morning","mañana","上午","testing of morning csv","Dawn,AM","noun"
6 changes: 3 additions & 3 deletions docs/user-guide/glossary/upload-glossaries.md
Expand Up @@ -10,8 +10,8 @@ See [glossary roles and permission](/user-guide/glossary/glossary-roles-permissi
##### header row

1. locale columns: The locale header column must contain a valid locale code. At least one locale code is required for the glossary. First locale column will be used as source.
1. part-of-speech column (optional): The part-of-speech header column, when included, should contain only one value: pos.
1. description column (optional): The description header column, when included, should contain only one value: description.
1. part-of-speech column (optional): The part-of-speech header column, when included, should be only one of: `pos`, `partofspeech` or `part of speech`.
1. description column (optional): The description header column, when included, should be only one of: `desc`, `description` or `definition`.


##### data rows
Expand All @@ -27,7 +27,7 @@ See [glossary roles and permission](/user-guide/glossary/glossary-roles-permissi
### Upload via Web UI

1. Login into Zanata
1. To upload to the **system glossary**, click `Glossary` menu.
1. To upload to the **system glossary**, click `Glossary` menu.
1. To upload to a **project glossary**, navigate to the project page, click on "Glossary" in the project page.
1. Click on `Import Glossary` on top right corner of the page.
<figure>
Expand Down

0 comments on commit 89a9521

Please sign in to comment.