<a href="https://colab.research.google.com/github/tahira4/Project1-File-Processing-System/blob/main/Project_1_File-Processing-System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Project_1: File-Processing-System

In [2]:
# Mount Google Drive at /content/drive; the C program written below reads its
# input files from paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [12]:
%%writefile word_count.c

#include <stdio.h> // Standard I/O functions
#include <stdlib.h> // Standard library functions
#include <string.h> // String handling functions
#include <pthread.h> // POSIX threads for multithreading
#include <sys/wait.h> // Wait for process termination
#include <sys/types.h> // Type definitions for various system calls
#include <sys/mman.h> // Memory management declarations
#include <sys/resource.h> // Resource usage definitions
#include <unistd.h> // Standard symbolic constants and types
#include <time.h> // Time handling functions
#include <stdbool.h> // Boolean data type

// Compile-time limits shared by every part of the program.
#define NUM_FILES 7          // Number of entries in the file_wc table (one worker per file)
#define MAX_WORD_LENGTH 100  // Bytes per stored word, including the terminating '\0'
#define MAX_WORDS 10000      // Capacity of the unique-word table; extra new words are dropped

// One unit of work: a file to scan and the word whose occurrences are counted in it.
typedef struct {
    const char *filepath;     // Path handed to fopen()
    const char *target_word;  // Token compared (exact, case-sensitive strcmp) against each word read
} FileWordCount;

// Results table written by every worker (thread or forked process).
// words[i] and total_word_count[i] are parallel arrays: entry i holds one
// unique token and the number of times it has been seen so far.
typedef struct {
    int word_count[NUM_FILES];  // Target-word count per file, indexed like file_wc
    int total_word_count[MAX_WORDS];  // Occurrences of words[i] summed over all files
    char words[MAX_WORDS][MAX_WORD_LENGTH];  // Unique tokens, each stored as a C string
    int unique_words;  // Number of slots of words[] / total_word_count[] currently in use
} SharedData;

// Work list: the files to scan, each paired with the word to count in it.
// The paths point into the mounted Google Drive folder; every entry currently
// uses the same target word.
FileWordCount file_wc[NUM_FILES] = {
    {"/content/drive/MyDrive/Colab Notebooks/calgary/progp", "the"},
    {"/content/drive/MyDrive/Colab Notebooks/calgary/progl", "the"},
    {"/content/drive/MyDrive/Colab Notebooks/calgary/progc", "the"},
    {"/content/drive/MyDrive/Colab Notebooks/calgary/trans", "the"},
    {"/content/drive/MyDrive/Colab Notebooks/calgary/paper1", "the"},
    {"/content/drive/MyDrive/Colab Notebooks/calgary/paper2", "the"},
    {"/content/drive/MyDrive/Colab Notebooks/calgary/bib", "the"}
};

// Results table for the mode currently running. multithreaded_execution()
// points it at a heap allocation shared by its threads;
// multiprocess_with_shared_memory() points it at a MAP_SHARED mapping that the
// forked children write into.
SharedData *shared_data;

// Count occurrences of file_wc->target_word in one file and fold every
// whitespace-delimited token of that file into the shared unique-word table.
//
// Parameters:
//   file_wc     - path of the file to scan and the word to count in it
//   shared_data - table receiving the per-file count and the word tallies
//   index       - slot of shared_data->word_count that receives this file's count
//
// If the file cannot be opened the error is reported with perror() and
// shared_data is left untouched. Once the table holds MAX_WORDS entries, new
// words are silently ignored (existing ones keep being tallied).
//
// NOTE(review): the table is updated without any locking, yet the callers in
// this file run this function concurrently from several threads / forked
// processes on the same SharedData, so tallies can be lost or words duplicated.
// A proper fix needs a lock stored inside SharedData.
void count_word_in_file(FileWordCount *file_wc, SharedData *shared_data, int index) {
    FILE *file = fopen(file_wc->filepath, "r");
    if (file == NULL) {
        perror("Failed to open file");
        return;
    }

    char buffer[1024];  // One whitespace-delimited token at a time
    int count = 0;
    while (fscanf(file, "%1023s", buffer) == 1) {
        // Compare with the target before clipping so an over-long token can
        // never match the target by prefix.
        if (strcmp(buffer, file_wc->target_word) == 0) {
            count++;
        }

        // A table slot holds MAX_WORD_LENGTH - 1 characters plus '\0'. Clip
        // longer tokens here so lookup and insertion use the same key and the
        // strcpy() below cannot write past the end of the slot.
        if (strlen(buffer) >= MAX_WORD_LENGTH) {
            buffer[MAX_WORD_LENGTH - 1] = '\0';
        }

        bool found = false;
        for (int i = 0; i < shared_data->unique_words; i++) {
            if (strcmp(shared_data->words[i], buffer) == 0) {
                shared_data->total_word_count[i]++;
                found = true;
                break;
            }
        }
        if (!found && shared_data->unique_words < MAX_WORDS) {
            int slot = shared_data->unique_words;
            strcpy(shared_data->words[slot], buffer);  // Fits: clipped above
            shared_data->total_word_count[slot] = 1;
            shared_data->unique_words = slot + 1;
        }
    }
    fclose(file);

    shared_data->word_count[index] = count;
    printf("File: %s, Total count of '%s': %d\n", file_wc->filepath, file_wc->target_word, shared_data->word_count[index]);
}

// Sortable copy of one unique-word table entry: the word and how often it occurred.
typedef struct {
    char word[MAX_WORD_LENGTH];  // Same capacity as a SharedData.words slot
    int frequency;               // Sort key used by compare()
} WordFrequency;

// Comparator function for sorting word frequencies
int compare(const void *a, const void *b) {
    return ((WordFrequency *)b)->frequency - ((WordFrequency *)a)->frequency;
}

// pthread entry point for one worker. arg points to an int holding the index
// of the file_wc entry this thread must process; the result is written into
// the global shared_data. Always returns NULL.
void* thread_function(void* arg) {
    const int *slot = (const int *)arg;
    count_word_in_file(file_wc + *slot, shared_data, *slot);
    return NULL;
}

// Run the word count with one thread per file_wc entry, then print the 50 most
// frequent words gathered across all files.
//
// Parameters:
//   target_word - unused; each file_wc entry carries its own target word. Kept
//                 so the function matches the signature measure_performance()
//                 expects.
//
// Exits the process with status 1 if the results table cannot be allocated.
void multithreaded_execution(const char* target_word) {
    (void)target_word;

    pthread_t threads[NUM_FILES];
    int thread_indices[NUM_FILES];  // Stable storage for each thread's argument
    bool started[NUM_FILES];        // Which threads were actually created

    // calloc so every counter starts at zero, not just unique_words.
    shared_data = calloc(1, sizeof(SharedData));
    if (shared_data == NULL) {
        perror("calloc");
        exit(1);
    }

    for (int i = 0; i < NUM_FILES; i++) {
        thread_indices[i] = i;
        // pthread_create reports failure through its return value, not errno.
        int rc = pthread_create(&threads[i], NULL, thread_function, &thread_indices[i]);
        started[i] = (rc == 0);
        if (rc != 0) {
            fprintf(stderr, "Failed to create thread: %s\n", strerror(rc));
        }
    }

    // Join only threads that exist; joining an uninitialised pthread_t is
    // undefined behaviour.
    for (int i = 0; i < NUM_FILES; i++) {
        if (started[i]) {
            pthread_join(threads[i], NULL);
        }
    }

    // Copy the parallel arrays into sortable records.
    WordFrequency word_freqs[MAX_WORDS];
    int unique = shared_data->unique_words;
    for (int i = 0; i < unique; i++) {
        strcpy(word_freqs[i].word, shared_data->words[i]);
        word_freqs[i].frequency = shared_data->total_word_count[i];
    }

    qsort(word_freqs, unique, sizeof(WordFrequency), compare);

    int shown = unique < 50 ? unique : 50;
    printf("\nTop 50 Most Frequent Words:\n");
    for (int i = 0; i < shown; i++) {
        printf("%s: %d\n", word_freqs[i].word, word_freqs[i].frequency);
    }

    free(shared_data);
    shared_data = NULL;  // Do not leave the global pointing at freed memory
}

// Run the word count with one forked child per file_wc entry. The children
// write into an anonymous MAP_SHARED mapping; the parent then prints the 50
// most frequent words gathered across all files.
//
// Parameters:
//   target_word - unused; each file_wc entry carries its own target word. Kept
//                 so the function matches the signature measure_performance()
//                 expects.
//
// Exits the process with status 1 if the mapping cannot be created or a fork
// fails (after reaping the children that were already started).
void multiprocess_with_shared_memory(const char *target_word) {
    (void)target_word;

    shared_data = mmap(NULL, sizeof(SharedData), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANONYMOUS, -1, 0);
    if (shared_data == MAP_FAILED) {
        perror("mmap");
        exit(1);
    }
    shared_data->unique_words = 0;

    // Flush buffered stdio output before forking; otherwise every child
    // inherits a copy of the pending buffer and writes it again on exit()
    // whenever stdout is a pipe or a file.
    fflush(NULL);

    int started = 0;
    bool fork_failed = false;
    for (int i = 0; i < NUM_FILES; i++) {
        pid_t pid = fork();
        if (pid == 0) {  // Child: process one file, then leave (exit flushes its output)
            count_word_in_file(&file_wc[i], shared_data, i);
            exit(0);
        } else if (pid < 0) {
            perror("fork");
            fork_failed = true;
            break;
        }
        started++;
    }

    // Reap exactly the children that were created.
    for (int i = 0; i < started; i++) {
        if (wait(NULL) == -1) {
            perror("wait");
            break;
        }
    }

    if (fork_failed) {
        munmap(shared_data, sizeof(SharedData));
        exit(1);
    }

    // Copy the parallel arrays into sortable records.
    WordFrequency word_freqs[MAX_WORDS];
    int unique = shared_data->unique_words;
    for (int i = 0; i < unique; i++) {
        strcpy(word_freqs[i].word, shared_data->words[i]);
        word_freqs[i].frequency = shared_data->total_word_count[i];
    }

    qsort(word_freqs, unique, sizeof(WordFrequency), compare);

    int shown = unique < 50 ? unique : 50;
    printf("\nTop 50 Most Frequent Words:\n");
    for (int i = 0; i < shown; i++) {
        printf("%s: %d\n", word_freqs[i].word, word_freqs[i].frequency);
    }

    if (munmap(shared_data, sizeof(SharedData)) == -1) {
        perror("munmap");
    }
}

// Function to measure performance by tracking CPU time, memory usage, and elapsed time
void measure_performance(void (*func)(const char *), const char *target_word, const char *description) {
    struct rusage usage;
    clock_t start, end;
    double cpu_time_used, elapsed_time;

    start = clock();
    func(target_word);
    end = clock();

    cpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC;
    elapsed_time = cpu_time_used;

    if (getrusage(RUSAGE_SELF, &usage) == 0) {
        double user_cpu_time = usage.ru_utime.tv_sec + usage.ru_utime.tv_usec / 1000000.0;
        double sys_cpu_time = usage.ru_stime.tv_sec + usage.ru_stime.tv_usec / 1000000.0;
        cpu_time_used = user_cpu_time + sys_cpu_time;
        double cpu_usage_percent = (cpu_time_used / elapsed_time) * 100.0;
        long memory_usage_kb = usage.ru_maxrss;

        printf("Mode: %s \nCPU Time: %f seconds \nCPU Usage: %.2f%% \nMemory Usage: %ld KB\n",
               description, cpu_time_used, cpu_usage_percent, memory_usage_kb);


    } else {
        printf("Mode: %s, CPU Time: %f seconds\n", description, cpu_time_used);
        perror("getrusage");
    }
}

// Entry point: benchmarks the multiprocess mode first, then the multithreaded
// mode, printing a banner before each measurement.
int main() {
    const char *word = "the";  // Forwarded to each mode as its target word

    struct {
        const char *banner;
        void (*run)(const char *);
        const char *label;
    } modes[] = {
        {"Multiprocessing with Shared Memory ===\n", multiprocess_with_shared_memory, "Multiprocessing with Shared Memory"},
        {"\nMultithreading Execution ===\n", multithreaded_execution, "Multithreading Execution"},
    };

    for (size_t m = 0; m < sizeof modes / sizeof modes[0]; m++) {
        fputs(modes[m].banner, stdout);
        measure_performance(modes[m].run, word, modes[m].label);
    }

    return 0;
}


Overwriting word_count.c


In [13]:
# Compile word_count.c with pthread support, then run the resulting binary
!gcc word_count.c -o word_count -pthread
!./word_count


Multiprocessing with Shared Memory ===
File: /content/drive/MyDrive/Colab Notebooks/calgary/progp, Total count of 'the': 0
File: /content/drive/MyDrive/Colab Notebooks/calgary/progc, Total count of 'the': 89
File: /content/drive/MyDrive/Colab Notebooks/calgary/progl, Total count of 'the': 46
File: /content/drive/MyDrive/Colab Notebooks/calgary/paper1, Total count of 'the': 434
File: /content/drive/MyDrive/Colab Notebooks/calgary/trans, Total count of 'the': 108
File: /content/drive/MyDrive/Colab Notebooks/calgary/paper2, Total count of 'the': 757
File: /content/drive/MyDrive/Colab Notebooks/calgary/bib, Total count of 'the': 146

Top 50 Most Frequent Words:
the: 1580
of: 1199
%A: 1196
and: 885
to: 783
a: 782
%T: 726
%D: 702
*: 670
in: 638
;: 559
is: 522
for: 465
%J: 457
%P: 443
%K: 432
if: 412
=: 396
%O: 371
be: 332
it: 319
%V: 304
that: 277
%I: 264
on: 262
begin: 258
%C: 255
The: 253
then: 244
with: 218
%N: 211
*/: 198
/*: 197
as: 189
by: 189
else: 187
.sp: 180
end;: 180
this: 169
an: