# Vectorization

In [1]:
# Notebook attribution. Built from adjacent string literals (instead of a
# backslash continuation inside one literal) so that the indentation of the
# continuation line does not end up embedded in the value.
__author__ = (
    "Christine Mendoza, "
    "with some sqlite3-related code from Dr. Gary Bishop's / Dr. John Majikes' "
    "UNC Chapel Hill COMP421 (Databases) class"
)

Before going further, make sure to define your file paths.

In [None]:
# Location of the SQLite database file; this value is handed to sqlite3 when
# the connection is opened in the next cell.
DB_PATH: str = "./analysis.db"

Set up your database connection.

In [None]:
import sqlite3
# Open the database at DB_PATH through the documented sqlite3.connect()
# factory (rather than instantiating sqlite3.Connection directly), then create
# the shared cursor that the query cells below execute against.
db = sqlite3.connect(DB_PATH)
cursor = db.cursor()

Define the SQL queries used to explore the database (tables, nodes, edges, and their literature sources).

In [None]:
# Names of all tables recorded in sqlite_master, skipping entries whose name
# starts with "sqlite_".
LIST_TABLES = """
    SELECT name
        FROM sqlite_master
        WHERE type='table' AND
                name NOT LIKE 'sqlite_%'
"""
# The `sql` column of sqlite_master for the same set of tables as LIST_TABLES.
LIST_SCHEMAS = """
    SELECT sql
        FROM sqlite_master
        WHERE type='table' AND
                name NOT LIKE 'sqlite_%'
"""
# Id and word of every row in Nodes.
NODES = """
    SELECT node_id, word FROM Nodes;
"""
# Every edge with its id, the id and word of its source and target nodes, and
# its edge type, ordered by source node. The two CTEs are the same projection
# of Nodes, named separately so that each endpoint gets its own join.
QUERY_LIST_ALL_EDGES = """
    WITH Source_Text_Info AS (SELECT node_id, word FROM Nodes),
        Target_Text_Info AS (SELECT node_id, word FROM Nodes)
    SELECT E.edge_id, E.source_node, S.word, E.target_node, T.word, E.edge_type
        FROM Edges E, Source_Text_Info S, Target_Text_Info T
        WHERE E.source_node = S.node_id
            AND E.target_node = T.node_id
        ORDER BY source_node;
"""
# Same joins as QUERY_LIST_ALL_EDGES, but returns only the (source word,
# target word) pair for each edge.
QUERY_LIST_ALL_EDGES_NO_ID = """
    WITH Source_Text_Info AS (SELECT node_id, word FROM Nodes),
        Target_Text_Info AS (SELECT node_id, word FROM Nodes)
    SELECT S.word, T.word
        FROM Edges E, Source_Text_Info S, Target_Text_Info T
        WHERE E.source_node = S.node_id
            AND E.target_node = T.node_id
        ORDER BY source_node;
"""
# Source word, target word and paper title for every (edge, paper) pairing in
# Literature_Sources_Edges, ordered by the edge's source node. Edges are linked
# to papers through Literature_Sources_Edges.edge_id / .paper_id.
EDGES = """
    WITH Source_Text_Info AS (SELECT node_id, word FROM Nodes),
        Target_Text_Info AS (SELECT node_id, word FROM Nodes)
    SELECT S.word, T.word, Sc.Title
        FROM Edges E, Source_Text_Info S, Target_Text_Info T, Literature_Sources_Edges L, Scopus_Info Sc
        WHERE E.source_node = S.node_id
            AND E.target_node = T.node_id
            AND E.edge_id = L.edge_id
            AND L.paper_id = Sc.paper_id
        ORDER BY E.source_node;
"""
# Node word and paper title for every (node, paper) pairing in
# Literature_Sources_Nodes, ordered by node id.
# NOTE(review): the name ARTICLES is rebound to different queries in later
# cells of this notebook.
ARTICLES = """
    SELECT No.word, Sc.Title
        FROM Scopus_Info Sc, Literature_Sources_Nodes L, Nodes No
        WHERE Sc.paper_id = L.paper_id
            AND L.node_id = No.node_id
        ORDER BY No.node_id;
"""
# Per node: id, word and the number of Literature_Sources_Nodes rows that
# reference it (counted via paper_id), most-referenced first, ties broken by
# word. Nodes without any such row are absent because the join is an inner one.
NUM_SOURCES_NODES = """
    SELECT No.node_id, No.word, COUNT(L.paper_id) AS source_count
        FROM Literature_Sources_Nodes L, Nodes No
        WHERE L.node_id = No.node_id
        GROUP BY No.node_id
        ORDER BY source_count DESC, No.word;
"""
# Total number of rows in Literature_Sources_Nodes.
NUM_LITERATURE_SOURCES_NODES = """
    SELECT COUNT(*) FROM Literature_Sources_Nodes;
"""
# Total number of rows in Literature_Sources_Edges.
NUM_LITERATURE_SOURCES_EDGES = """
    SELECT COUNT(*) FROM Literature_Sources_Edges;
"""
# Total number of rows in Nodes.
NUM_NODES = """
    SELECT COUNT(*) FROM Nodes;
"""
# Total number of rows in Edges.
NUM_EDGES = """
    SELECT COUNT(*) FROM Edges;
"""
# Full sqlite_master rows (all columns) for the same set of tables as
# LIST_TABLES.
TABLES = """
    SELECT *
        FROM sqlite_master
        WHERE type='table' AND
            name NOT LIKE 'sqlite_%'
"""
# (Focus_Population, node word) for every node sourced from a paper whose
# Review_Notes.Focus_Population matches '%visual%'.
FOCUS_POPULATION = """
    SELECT Re.Focus_Population, No.word
        FROM Review_Notes Re, Nodes No, Literature_Sources_Nodes Li
        WHERE Re.paper_id = Li.paper_id
            AND No.node_id = Li.node_id
            AND Re.Focus_Population like '%visual%';
"""
# (Study_Country_ies, node word) for every node sourced from a paper whose
# Review_Notes.Study_Country_ies matches '%India%'.
COUNTRY = """
    SELECT Re.Study_Country_ies, No.word
        FROM Review_Notes Re, Nodes No, Literature_Sources_Nodes Li
        WHERE Re.paper_id = Li.paper_id
            AND No.node_id = Li.node_id
            AND Re.Study_Country_ies like '%India%';
"""

# Parameterized update of Completed.completed for one paper.
# Positional parameters, in order: (completed, paper_id).
QUERY_UPDATE_REVIEW_STATUS = """
    UPDATE Completed
    SET completed = ?
    WHERE paper_id = ?;
"""

# Ids of papers whose Review_Notes.Focus_Location matches any of the
# education-related patterns below.
SPECIFIC_ARTICLES = """
    SELECT S.paper_id
        FROM Review_Notes S WHERE
        S.Focus_Location like'%education%' OR
        S.Focus_Location like'%school%' OR
        S.Focus_Location like'%university%' OR
        S.Focus_Location like'%Higher Ed%';
"""

# Source counts per node word, restricted to papers whose
# Review_Notes.Focus_Location matches neither '%indoor %' nor '%building%'.
# NOTE(review): this groups by No.word while also selecting No.node_id, whereas
# NUM_SOURCES_NODES groups by No.node_id. If several nodes share a word, they
# are merged into one row here -- confirm that is intended.
SPECIFIC_ARTICLES_NUM_SOURCES = """
    WITH Specific_Articles AS (SELECT Re.paper_id
                                FROM Review_Notes Re WHERE
                                Re.Focus_Location not like'%indoor %' AND
                                Re.Focus_Location not like'%building%')
    SELECT No.node_id, No.word, COUNT(L.paper_id) AS source_count
        FROM Literature_Sources_Nodes L, Nodes No, Specific_Articles Sp
        WHERE L.node_id = No.node_id
            AND L.paper_id = Sp.paper_id
        GROUP BY No.word
        ORDER BY source_count DESC, No.word;
"""
# Source counts per word (no node id column).
# NOTE(review): unlike the sibling SPECIFIC_ARTICLES_* queries, the paper
# filter here is the positive form (like '%indoor %' OR like '%building%'),
# i.e. it selects the complementary set of papers -- confirm this is intended.
SPECIFIC_ARTICLES_NUM_SOURCES_NO_ID = """
    WITH Specific_Articles AS (SELECT Re.paper_id
                                FROM Review_Notes Re WHERE
                                Re.Focus_Location like'%indoor %' OR
                                Re.Focus_Location like'%building%')
    SELECT No.word, COUNT(L.paper_id) AS source_count
        FROM Literature_Sources_Nodes L, Nodes No, Specific_Articles Sp
        WHERE L.node_id = No.node_id
            AND L.paper_id = Sp.paper_id
        GROUP BY No.word
        ORDER BY source_count DESC, No.word;
"""
# Number of Literature_Sources_Nodes rows that belong to papers whose
# Focus_Location matches neither '%indoor %' nor '%building%'.
SPECIFIC_ARTICLES_NUM_LITERATURE_SOURCES_NODES = """
    WITH Specific_Articles AS (SELECT Re.paper_id
                                FROM Review_Notes Re WHERE
                                Re.Focus_Location not like'%indoor %' AND
                                Re.Focus_Location not like'%building%')
    SELECT COUNT(*) FROM Literature_Sources_Nodes L, Specific_Articles Sp
        WHERE L.paper_id = Sp.paper_id;
"""
# Number of Literature_Sources_Edges rows that belong to the same filtered set
# of papers.
SPECIFIC_ARTICLES_NUM_LITERATURE_SOURCES_EDGES = """
    WITH Specific_Articles AS (SELECT Re.paper_id
                                FROM Review_Notes Re WHERE
                                Re.Focus_Location not like'%indoor %' AND
                                Re.Focus_Location not like'%building%')
    SELECT COUNT(*) FROM Literature_Sources_Edges L, Specific_Articles Sp
        WHERE L.paper_id = Sp.paper_id;
"""

# Edges (edge id, source id/word, target id/word, edge type) backed by a paper
# whose Review_Notes.Focus_Location matches neither '%indoor %' nor
# '%building%'. One row is returned per (edge, paper) pairing, ordered by the
# edge's source node.
#
# Edge provenance is looked up in Literature_Sources_Edges by edge_id, the
# same link the EDGES query uses. Joining Literature_Sources_Nodes on
# `E.edge_id = L.node_id` would compare an edge id with a node id and attach
# unrelated papers to each edge.
SPECIFIC_ARTICLES_EDGES = """
    WITH Source_Text_Info AS (SELECT node_id, word FROM Nodes),
        Target_Text_Info AS (SELECT node_id, word FROM Nodes),
        Specific_Articles AS (SELECT Re.paper_id
                                FROM Review_Notes Re WHERE
                                Re.Focus_Location not like'%indoor %' AND
                                Re.Focus_Location not like'%building%')
    SELECT E.edge_id, E.source_node, S.word, E.target_node, T.word, E.edge_type
        FROM Edges E, Source_Text_Info S, Target_Text_Info T, Literature_Sources_Edges L, Scopus_Info Sc, Specific_Articles Sp
        WHERE E.source_node = S.node_id
            AND E.target_node = T.node_id
            AND E.edge_id = L.edge_id
            AND L.paper_id = Sc.paper_id
            AND Sp.paper_id = Sc.paper_id
        ORDER BY E.source_node;"""

# Per edge: source word, target word, edge type and the target's source_count
# (number of Literature_Sources_Nodes rows for the target), ordered by count
# (descending), then target word, then source word.
# Target_Nodes is grouped by No.word here, not by node id.
# NOTE(review): because Target_Nodes also exposes No.node_id and the outer
# query joins on it, words shared by several nodes keep only one node id per
# group -- confirm that edges to the other ids are meant to be dropped.
# The predicate `E.target_node = T.node_id` appears twice; the repeat is
# redundant.
EDGES_SOURCE_COUNT_AGGREGATED = """
    WITH Target_Nodes AS (SELECT No.node_id, No.word, COUNT(L.paper_id) AS source_count
                            FROM Literature_Sources_Nodes L, Nodes No
                            WHERE L.node_id = No.node_id
                            GROUP BY No.word
                            ORDER BY source_count DESC, No.word),
    All_Nodes AS (SELECT node_id, word FROM Nodes)
    SELECT A.word, T.word, E.edge_type, T.source_count
        FROM Edges E, Target_Nodes T, All_Nodes A
        WHERE E.target_node = T.node_id
            AND E.source_node = A.node_id
            AND E.target_node = T.node_id
        ORDER BY T.source_count DESC, T.word, A.word;
"""
# Same report with Target_Nodes grouped by node id, and with the source and
# target node ids included in the output. Edges whose target node has no
# Literature_Sources_Nodes row are absent (Target_Nodes is an inner join).
# The predicate `E.target_node = T.node_id` appears twice; the repeat is
# redundant.
EDGES_SOURCE_COUNT = """
    WITH Target_Nodes AS (SELECT No.node_id, No.word, COUNT(L.paper_id) AS source_count
                            FROM Literature_Sources_Nodes L, Nodes No
                            WHERE L.node_id = No.node_id
                            GROUP BY No.node_id
                            ORDER BY source_count DESC, No.word),
    All_Nodes AS (SELECT node_id, word FROM Nodes)
    SELECT A.word, A.node_id, T.word, T.node_id, E.edge_type, T.source_count
        FROM Edges E, Target_Nodes T, All_Nodes A
        WHERE E.target_node = T.node_id
            AND E.source_node = A.node_id
            AND E.target_node = T.node_id
        ORDER BY T.source_count DESC, T.word, A.word;
"""
# EDGES_SOURCE_COUNT with the target's source_count computed only over papers
# whose Review_Notes.Focus_Location matches neither '%indoor %' nor
# '%building%'.
SPECIFIC_ARTICLES_EDGES_SOURCE_COUNT = """
    WITH Specific_Articles AS (SELECT Re.paper_id
                                FROM Review_Notes Re WHERE
                                Re.Focus_Location not like '%indoor %' AND
                                Re.Focus_Location not like '%building%'),
    Target_Nodes AS (SELECT No.node_id, No.word, COUNT(L.paper_id) AS source_count
                        FROM Literature_Sources_Nodes L, Nodes No, Specific_Articles Sp
                        WHERE L.node_id = No.node_id
                            AND L.paper_id = Sp.paper_id
                        GROUP BY No.node_id
                        ORDER BY source_count DESC, No.word),
    All_Nodes AS (SELECT node_id, word FROM Nodes)
    SELECT A.word, A.node_id, T.word, T.node_id, E.edge_type, T.source_count
        FROM Edges E, Target_Nodes T, All_Nodes A
        WHERE E.target_node = T.node_id
            AND E.source_node = A.node_id
            AND E.target_node = T.node_id
        ORDER BY T.source_count DESC, T.word, A.word;
"""
# EDGES_SOURCE_COUNT restricted to edges where the source word or the target
# word matches one of the patterns below. Note that 'door%' has no leading
# wildcard (prefix match), while the other patterns match anywhere in the word.
SPECIFIC_WORDS_EDGES_SOURCE_COUNT = """
    WITH Target_Nodes AS (SELECT No.node_id, No.word, COUNT(L.paper_id) AS source_count
                        FROM Literature_Sources_Nodes L, Nodes No
                        WHERE L.node_id = No.node_id
                        GROUP BY No.node_id
                        ORDER BY source_count DESC, No.word),
    All_Nodes AS (SELECT node_id, word FROM Nodes)
    SELECT A.word, A.node_id, T.word, T.node_id, E.edge_type, T.source_count
        FROM Edges E, Target_Nodes T, All_Nodes A
        WHERE E.target_node = T.node_id
            AND E.source_node = A.node_id
            AND E.target_node = T.node_id
            AND ((A.word like 'door%'
            OR A.word like '%entrance%'
            OR A.word like '%exit%'
            OR A.word like '%frontage%'
            OR A.word like '%hardware%') OR
            (T.word like 'door%'
            OR T.word like '%entrance%'
            OR T.word like '%exit%'
            OR T.word like '%frontage%'
            OR T.word like '%hardware%'))
        ORDER BY T.source_count DESC, T.word, A.word;
"""

# Every node whose word is shared with at least one other node, as
# (node_id, word) rows ordered by word and then node id.
# A plain `GROUP BY word` returns one row per distinct word whether or not it
# is duplicated; the subquery with HAVING COUNT(*) > 1 keeps only words that
# occur more than once, and the outer query lists each node carrying them.
FIND_DUPLICATE_NODES = """
    SELECT node_id, word FROM Nodes
        WHERE word IN (SELECT word FROM Nodes
                        GROUP BY word
                        HAVING COUNT(*) > 1)
        ORDER BY word, node_id;
"""

<sqlite3.Cursor at 0x114466030>

In [None]:
# Find the columns!
# Each distinct node word becomes one column of the article-by-word matrix.
DISTINCT_WORDS = """
    SELECT DISTINCT word FROM Nodes;
"""
cursor.execute(DISTINCT_WORDS)
result = cursor.fetchall()
print(result)
# "ROOT" is excluded from the columns. Filtering it out (instead of calling
# list.remove) gives the same list when "ROOT" is present -- DISTINCT yields it
# at most once -- and does not raise ValueError when it is absent.
all_words = [record[0] for record in result if record[0] != "ROOT"]
print(all_words)



In [None]:
# Prototype CSV row: the article id followed by a 0 flag for every word
# column. Each article later starts from a copy of this dict.
template_dict = {"article_id": 0}
template_dict.update(dict.fromkeys(all_words, 0))
print(template_dict)



In [None]:
# Collect the id of every paper listed in Scopus_Info; each id becomes one row
# of the output matrix. (This rebinds ARTICLES to a paper-id query.)
ARTICLES = """
    SELECT Sc.paper_id FROM Scopus_Info SC;
"""
# Cursor.execute() returns the cursor itself, so the fetch can be chained.
result = cursor.execute(ARTICLES).fetchall()
article_ids = [paper_id for (paper_id,) in result]
print(article_ids)

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86]


In [None]:
# Not executed in this cell; kept because it rebinds ARTICLES back to the
# (word, title) listing, which other cells may read.
ARTICLES = """
    SELECT No.word, Sc.Title
        FROM Scopus_Info Sc, Literature_Sources_Nodes L, Nodes No
        WHERE Sc.paper_id = L.paper_id
            AND L.node_id = No.node_id
        ORDER BY No.node_id;
"""

# Words of all nodes attributed to one paper. Defined once, outside the loop.
# Both sides are compared as text with `=` so the id is matched exactly and
# never interpreted as a LIKE pattern.
WORDS_FOR_ARTICLE = """
    SELECT No.word
        FROM Literature_Sources_Nodes L, Nodes No
        WHERE L.node_id = No.node_id
            AND CAST(L.paper_id AS TEXT) = CAST(:paper_id AS TEXT);
"""

# Build one row per article: start from the all-zero template and set the flag
# to 1 for every word that the article is a source for.
csv_rows = []
for article_id in article_ids:
    csv_row = template_dict.copy()
    csv_row["article_id"] = article_id
    print(csv_row["article_id"])

    # check which words appear in this article
    cursor.execute(WORDS_FOR_ARTICLE, {"paper_id": article_id})
    selected_words = [record[0] for record in cursor.fetchall()]
    print(selected_words)

    # change their records to 1 in csv_row
    for word in selected_words:
        # Only flag words that are columns of the template. A word left out of
        # the columns (e.g. "ROOT") would otherwise add an unknown key, which
        # csv.DictWriter rejects with ValueError when the rows are written.
        if word in csv_row:
            csv_row[word] = 1

    print(csv_row)
    csv_rows.append(csv_row)

1
['quality of life', 'independence', 'social participation', 'well-being', 'variety', 'accessibility', 'connectivity', 'predictability', 'clarity', 'spaciousness', 'escape possibilities', 'spaciousness', 'interactivity', 'homelike', 'inclusivity', 'homelike and robust materials', 'personalizability', 'comfort', 'visual calm', 'escape space', 'sensory room', 'good acoustics', 'stimulus zones', 'lighting', 'ventilation', 'safety']
2
['urban', 'pedestrian', 'traffic', 'sidewalk', 'crosswalk', 'crossing', 'signals']
3
['pedestrian', 'path', 'width', 'regularity', 'slope', 'material', 'irregular joins', 'gravel', 'sand', 'clay']
4
['points of interest', 'social', 'exercise', 'appointments', 'shopping', 'green space', 'garden', 'tree', 'street', 'traffic', 'noise level', 'purpose', 'lanes', 'speed limit', 'structure', 'sidewalk', 'median', 'simple juncture', 'roundabout', 'buffer zone between road and sidewalk', 'parking lot', 'open time', 'close time', 'peak hours', 'public transportation'

In [None]:
import csv

# Write the article-by-word matrix: a header row, then one row per article.
# - "w" rather than "a": the header is written on every run, so appending
#   would stack a second header and a second copy of the rows on each re-run.
# - newline="": required by the csv module so it controls line endings itself
#   (otherwise extra blank lines appear on some platforms).
# - encoding="utf-8": keeps the output independent of the platform default.
with open("output.csv", "w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, fieldnames=["article_id"] + all_words)
    writer.writeheader()
    writer.writerows(csv_rows)