In [32]:
# Define the documents as a dictionary where keys are document titles and values are document content
documents = {
    "Article: Pandas Basics": "This article covers the basics of using Pandas in Python.",
    "Tutorial: Data Visualization": "Learn about data visualization techniques with Python libraries.",
    "Case Study: Sales Analysis": "Analyze sales data using Python for a real-world case study."
}

In [33]:
# get the keys of the documents and store it in the list
key_documents = [key for key in documents ]
# display the list of the key documents
print(key_documents)

['Article: Pandas Basics', 'Tutorial: Data Visualization', 'Case Study: Sales Analysis']


In [43]:
# Extract the category of the key documents and store it 
categories = []
for key_doc in documents:
    category = key_doc.split(': ')[0]
    categories.append(category)
    

print(categories)

['Article', 'Tutorial', 'Case Study']


In [35]:

# Extract the content values and put them in a list
document_content_list = list(documents.values())
# Display the list of document content
print(document_content_list)

['This article covers the basics of using Pandas in Python.', 'Learn about data visualization techniques with Python libraries.', 'Analyze sales data using Python for a real-world case study.']


In [36]:
# Gather the set of all unique terms from the list content of document and diplay the result
#unique_terms = {term for doc_content in document_content_list for term in doc_content.split()}
unique_terms = set()
for doc_content in document_content_list:
    terms = doc_content.split()
    unique_terms.update(terms)
print(unique_terms)

{'about', 'Analyze', 'covers', 'Learn', 'for', 'real-world', 'article', 'with', 'a', 'case', 'basics', 'the', 'Python', 'visualization', 'study.', 'in', 'sales', 'Pandas', 'Python.', 'This', 'of', 'techniques', 'data', 'using', 'libraries.'}


In [37]:
# create a matrix for the document by using the unique term
doc_term_matrix = {}

for term in unique_terms:
    doc_term_matrix[term] = []

    for doc_content in document_content_list:
        if term in doc_content:
            doc_term_matrix[term].append(1)
        else:
            doc_term_matrix[term].append(0)
doc_term_matrix

{'about': [0, 1, 0],
 'Analyze': [0, 0, 1],
 'covers': [1, 0, 0],
 'Learn': [0, 1, 0],
 'for': [0, 0, 1],
 'real-world': [0, 0, 1],
 'article': [1, 0, 0],
 'with': [0, 1, 0],
 'a': [1, 1, 1],
 'case': [0, 0, 1],
 'basics': [1, 0, 0],
 'the': [1, 0, 0],
 'Python': [1, 1, 1],
 'visualization': [0, 1, 0],
 'study.': [0, 0, 1],
 'in': [1, 0, 1],
 'sales': [0, 0, 1],
 'Pandas': [1, 0, 0],
 'Python.': [1, 0, 0],
 'This': [1, 0, 0],
 'of': [1, 0, 0],
 'techniques': [0, 1, 0],
 'data': [0, 1, 1],
 'using': [1, 0, 1],
 'libraries.': [0, 1, 0]}

In [45]:
# import the numpy library if it doesn't work you need to install numpy
import numpy as np

docs_array = np.array(document_content_list, dtype='object')

v1 = np.array(doc_term_matrix['the'])    
v2 = np.array(doc_term_matrix['Python'])

print(v1)
print(v2)
print('-------')

# find the documents that have both terms from v1 and v2
v3 = v1 & v2

print(v3)

# display the content document from the result
...
[doc for doc in v3 * docs_array if doc]

[1 0 0]
[1 1 1]
-------
[1 0 0]


['This article covers the basics of using Pandas in Python.']

In [39]:
# find the document for those have at least one word

import numpy as np

docs_array = np.array(document_content_list, dtype='object')

v1 = np.array(doc_term_matrix['the'])    
v2 = np.array(doc_term_matrix['Python'])

print(v1)
print(v2)
print('-------')

# find the documents that have both terms from v1 and v2
v3 = v1 | v2

print(v3)

# display the content document from the result
...
[doc for doc in v3 * docs_array if doc]

[1 0 0]
[1 1 1]
-------
[1 1 1]


['This article covers the basics of using Pandas in Python.',
 'Learn about data visualization techniques with Python libraries.',
 'Analyze sales data using Python for a real-world case study.']

In [112]:
import pandas as pd
from tabulate import tabulate

def print_csv_as_table(csv_file_path):
    # Read the CSV file into a DataFrame
    df = pd.read_csv(csv_file_path, header=None, names=['Documents', 'Contents'])

    # Set the maximum column width to a large value to prevent truncation
    pd.set_option('display.max_colwidth', None)

    # Set the column headers to be left-aligned
    pd.set_option('display.colheader_justify', 'center')

    # Print the DataFrame as a table
    table = tabulate(df, headers='keys', tablefmt='grid')
    print(table)

# Example usage:
csv_file_path = '/Users/vandaragnep/Downloads/tdoc.csv'
print_csv_as_table(csv_file_path)



+----+-------------+--------------------------------------------------------------+
|    | Documents   | Contents                                                     |
|  0 | Document 1  | This is the first document. It contains some words.          |
+----+-------------+--------------------------------------------------------------+
|  1 | Document 2  | The second document is a bit longer and has different words. |
+----+-------------+--------------------------------------------------------------+
|  2 | Document 3  | The third document is short. Short documents can be concise. |
+----+-------------+--------------------------------------------------------------+


In [116]:
import csv
unique_terms = set()
for doc_content in document_content_list:
    terms = doc_content.split()
    unique_terms.update(terms)
with open('unique_terms.csv', 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile)
    csv_writer.writerow(unique_terms)