<a href="https://colab.research.google.com/github/tahira4/Project1-File-Processing-System/blob/main/Project_1_File-Processing-System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Project_1: File-Processing-System

In [1]:
# Mount Google Drive at /content/drive so files stored on Drive can be read
# from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%%writefile word_count.c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <sys/wait.h>
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/resource.h>
#include <unistd.h>
#include <time.h>
#include <stdbool.h>

// Compile-time limits used by the structs and functions below.
#define NUM_FILES 7  // Number of entries in file_wc[]; one worker (thread or process) is started per entry
#define MAX_WORD_LENGTH 100  // Size in bytes of one stored word slot, including the terminating '\0'
#define MAX_WORDS 10000  // Capacity of the word histogram; words first seen after it is full are not recorded

// One unit of work: a file to scan and the word whose occurrences are counted in it.
typedef struct {
    const char *filepath;  // Path passed to fopen() for the file to be processed
    const char *target_word;  // Word compared (exact, case-sensitive strcmp) against every token in the file
} FileWordCount;

// Results written by all workers: per-file target-word counts plus a word
// histogram stored as two parallel arrays (words[i] <-> total_word_count[i]).
typedef struct {
    int word_count[NUM_FILES];  // word_count[i] = occurrences of the target word in file i
    int total_word_count[MAX_WORDS];  // total_word_count[i] = occurrences of words[i] across all files
    char words[MAX_WORDS][MAX_WORD_LENGTH];  // Distinct tokens seen so far, one NUL-terminated string per slot
    int unique_words;  // Number of slots of words[]/total_word_count[] currently in use
} SharedData;

// Work list: one entry per input file, each paired with the word to count in it.
// The position of an entry is the index used for SharedData.word_count[].
FileWordCount file_wc[NUM_FILES] = {
    { .filepath = "/content/drive/MyDrive/Colab Notebooks/calgary/progp",
      .target_word = "the" },
    { .filepath = "/content/drive/MyDrive/Colab Notebooks/calgary/progl",
      .target_word = "the" },
    { .filepath = "/content/drive/MyDrive/Colab Notebooks/calgary/progc",
      .target_word = "the" },
    { .filepath = "/content/drive/MyDrive/Colab Notebooks/calgary/trans",
      .target_word = "the" },
    { .filepath = "/content/drive/MyDrive/Colab Notebooks/calgary/paper1",
      .target_word = "the" },
    { .filepath = "/content/drive/MyDrive/Colab Notebooks/calgary/paper2",
      .target_word = "the" },
    { .filepath = "/content/drive/MyDrive/Colab Notebooks/calgary/bib",
      .target_word = "the" }
};

// Result storage for whichever mode is currently running: heap memory in
// multithreaded_execution(), an anonymous MAP_SHARED mapping in
// multiprocess_with_shared_memory(). Each of those functions releases it
// before returning, so the pointer must not be used between runs.
SharedData *shared_data;

/*
 * Scan one file token by token, count how often the target word occurs, and
 * fold every token into the shared word histogram.
 *
 *   file_wc     - path of the file and the word to count in it
 *   shared_data - histogram and per-file result storage
 *   index       - slot of shared_data->word_count that receives this file's count
 *
 * Tokens are whitespace-delimited (fscanf "%s"); matching is exact and
 * case-sensitive, and punctuation attached to a token is kept. Histogram keys
 * are limited to MAX_WORD_LENGTH - 1 characters, so longer tokens are
 * truncated and tokens sharing that prefix are counted together. Words first
 * seen after the histogram holds MAX_WORDS entries are not recorded.
 *
 * If the file cannot be opened, the error is reported with perror() and the
 * result slot is set to 0.
 *
 * NOTE: the histogram is read and written here without any locking, while the
 * callers in this file run this function from several threads/processes at
 * once on the same SharedData. Concurrent updates can therefore be lost or
 * produce duplicate entries.
 */
void count_word_in_file(FileWordCount *file_wc, SharedData *shared_data, int index) {
    FILE *file = fopen(file_wc->filepath, "r");
    if (file == NULL) {
        perror("Failed to open file");
        // Leave a defined value behind instead of whatever the slot held before
        shared_data->word_count[index] = 0;
        return;
    }

    char buffer[1024];
    char word[MAX_WORD_LENGTH];
    int count = 0;
    while (fscanf(file, "%1023s", buffer) == 1) {
        // Compare the full token so truncation below cannot create false matches
        if (strcmp(buffer, file_wc->target_word) == 0) {
            count++;
        }

        // A token can be up to 1023 chars but a histogram slot holds only
        // MAX_WORD_LENGTH bytes; truncate before lookup AND store so the key
        // always fits and an already-stored long token is still found.
        snprintf(word, sizeof(word), "%s", buffer);

        bool found = false;
        for (int i = 0; i < shared_data->unique_words; i++) {
            if (strcmp(shared_data->words[i], word) == 0) {
                shared_data->total_word_count[i]++;
                found = true;
                break;
            }
        }
        if (!found && shared_data->unique_words < MAX_WORDS) {
            strcpy(shared_data->words[shared_data->unique_words], word);
            shared_data->total_word_count[shared_data->unique_words] = 1;
            shared_data->unique_words++;
        }
    }
    fclose(file);
    shared_data->word_count[index] = count;
    printf("File: %s, Total count of '%s': %d\n", file_wc->filepath, file_wc->target_word, shared_data->word_count[index]);
}

// A (word, count) pair copied out of SharedData so the histogram can be
// sorted with qsort() without disturbing the parallel arrays.
typedef struct {
    char word[MAX_WORD_LENGTH];  // NUL-terminated copy of the word
    int frequency;  // Number of occurrences recorded for that word
} WordFrequency;

int compare(const void *a, const void *b) {
    return ((WordFrequency *)b)->frequency - ((WordFrequency *)a)->frequency;
}

/*
 * pthread start routine. arg points to an int holding the position in
 * file_wc[] of the file this thread processes; the same value selects the
 * result slot in shared_data. Always returns NULL.
 */
void* thread_function(void* arg) {
    const int *slot = arg;

    count_word_in_file(file_wc + *slot, shared_data, *slot);
    return NULL;
}

void multithreaded_execution(const char* target_word) {
    pthread_t threads[NUM_FILES];
    int thread_indices[NUM_FILES];
    shared_data = (SharedData*) malloc(sizeof(SharedData));
    shared_data->unique_words = 0;

    for (int i = 0; i < NUM_FILES; i++) {
        thread_indices[i] = i;
        if (pthread_create(&threads[i], NULL, thread_function, &thread_indices[i]) != 0) {
            perror("Failed to create thread");
        }
    }

    for (int i = 0; i < NUM_FILES; i++) {
        pthread_join(threads[i], NULL);
    }

    WordFrequency word_freqs[MAX_WORDS];
    for (int i = 0; i < shared_data->unique_words; i++) {
        strcpy(word_freqs[i].word, shared_data->words[i]);
        word_freqs[i].frequency = shared_data->total_word_count[i];
    }

    qsort(word_freqs, shared_data->unique_words, sizeof(WordFrequency), compare);

    printf("\nTop 50 Most Frequent Words:\n");
    for (int i = 0; i < (shared_data->unique_words < 50 ? shared_data->unique_words : 50); i++) {
        printf("%s: %d\n", word_freqs[i].word, word_freqs[i].frequency);
    }

    free(shared_data);
}

void multiprocess_with_shared_memory(const char *target_word) {
    shared_data = mmap(NULL, sizeof(SharedData), PROT_READ | PROT_WRITE,
                                    MAP_SHARED | MAP_ANONYMOUS, -1, 0);
    if (shared_data == MAP_FAILED) {
        perror("mmap");
        exit(1);
    }
    shared_data->unique_words = 0;

    for (int i = 0; i < NUM_FILES; i++) {
        pid_t pid = fork();
        if (pid == 0) {
            count_word_in_file(&file_wc[i], shared_data, i);
            exit(0);
        } else if (pid < 0) {
            perror("fork");
            exit(1);
        }
    }

    for (int i = 0; i < NUM_FILES; i++) {
        wait(NULL);
    }

    WordFrequency word_freqs[MAX_WORDS];
    for (int i = 0; i < shared_data->unique_words; i++) {
        strcpy(word_freqs[i].word, shared_data->words[i]);
        word_freqs[i].frequency = shared_data->total_word_count[i];
    }

    qsort(word_freqs, shared_data->unique_words, sizeof(WordFrequency), compare);

    printf("\nTop 50 Most Frequent Words:\n");
    for (int i = 0; i < (shared_data->unique_words < 50 ? shared_data->unique_words : 50); i++) {
        printf("%s: %d\n", word_freqs[i].word, word_freqs[i].frequency);
    }

    if (munmap(shared_data, sizeof(SharedData)) == -1) {
        perror("munmap");
    }
}

void measure_performance(void (*func)(const char *), const char *target_word, const char *description) {
    struct rusage usage;
    clock_t start, end;
    double cpu_time_used;

    start = clock();
    func(target_word);
    end = clock();

    cpu_time_used = ((double)(end - start)) / CLOCKS_PER_SEC;
    printf("Mode: %s, Time: %f seconds\n", description, cpu_time_used);

    getrusage(RUSAGE_SELF, &usage);
    printf("Mode: %s, CPU Usage: %ld.%06ld seconds\n", description,
           usage.ru_utime.tv_sec, usage.ru_utime.tv_usec);
    printf("Mode: %s, Memory Usage: %ld KB\n", description, usage.ru_maxrss);
}

/*
 * Entry point: runs the multiprocess mode and then the multithreaded mode,
 * each under measure_performance(), printing a banner before each one.
 * Always returns 0.
 */
int main() {
    // One row per mode, in execution order: banner text, function, report label.
    static const struct {
        const char *banner;
        void (*run)(const char *);
        const char *label;
    } modes[] = {
        { "Multiprocessing with Shared Memory ===\n",
          multiprocess_with_shared_memory,
          "Multiprocessing with Shared Memory" },
        { "\nMultithreading ===\n",
          multithreaded_execution,
          "Multithreading" },
    };
    const char *word = "the";

    for (size_t m = 0; m < sizeof(modes) / sizeof(modes[0]); m++) {
        printf("%s", modes[m].banner);
        measure_performance(modes[m].run, word, modes[m].label);
    }

    return 0;
}


Writing word_count.c


In [5]:
# Compile word_count.c (written by the %%writefile cell above) with pthread
# support, then run the resulting binary.
!gcc word_count.c -o word_count -pthread
!./word_count


Multiprocessing with Shared Memory ===
File: /content/drive/MyDrive/Colab Notebooks/calgary/progp, Total count of 'the': 0
File: /content/drive/MyDrive/Colab Notebooks/calgary/paper1, Total count of 'the': 434
File: /content/drive/MyDrive/Colab Notebooks/calgary/progc, Total count of 'the': 89
File: /content/drive/MyDrive/Colab Notebooks/calgary/paper2, Total count of 'the': 757
File: /content/drive/MyDrive/Colab Notebooks/calgary/trans, Total count of 'the': 108
File: /content/drive/MyDrive/Colab Notebooks/calgary/progl, Total count of 'the': 46
File: /content/drive/MyDrive/Colab Notebooks/calgary/bib, Total count of 'the': 146

Top 50 Most Frequent Words:
the: 1580
of: 1199
%A: 1197
and: 886
to: 783
a: 782
%T: 726
%D: 702
*: 670
in: 638
;: 559
is: 522
for: 465
%J: 457
%P: 443
%K: 432
if: 412
=: 396
%O: 371
be: 332
it: 319
%V: 304
that: 277
%I: 264
on: 262
begin: 258
%C: 255
The: 254
then: 244
with: 218
%N: 211
*/: 198
/*: 197
by: 190
as: 189
else: 187
.sp: 180
end;: 180
this: 169
an: