# CommandLine.sh

## Presentation of data:

In [11]:
import pandas as pd

# Read the TSV file into a DataFrame
file_path = 'merged_courses.tsv'
data = pd.read_csv(file_path, sep='\t')

# Display the DataFrame
data

Unnamed: 0,courseName,universityName,facultyName,isItFullTime,description,startDate,fees,modality,duration,city,country,administration,url
0,3D Design for Virtual Environments - MSc,Glasgow Caledonian University,School of Engineering and Built Environment,Full time,3D visualisation and animation play a role in ...,September,Please see the university website for further ...,MSc,1 year full-time,Glasgow,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...
1,Accounting and Finance - MSc,University of Leeds,Leeds University Business School,Full time,Businesses and governments rely on sound finan...,September,"UK: £18,000 (Total) International: £34,750 (To...",MSc,1 year full time,Leeds,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...
2,"Accounting, Accountability & Financial Managem...",King’s College London,King’s Business School,Full time,"Our Accounting, Accountability & Financial Man...",September,Please see the university website for further ...,MSc,1 year FT,London,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...
3,"Accounting, Financial Management and Digital B...",University of Reading,Henley Business School,Full time,Embark on a professional accounting career wit...,September,Please see the university website for further ...,MSc,1 year full time,Reading,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...
4,Addictions MSc,King’s College London,"Institute of Psychiatry, Psychology and Neuros...",Full time,Join us for an online session for prospective ...,September,Please see the university website for further ...,MSc,One year FT,London,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,Materials and Molecular Modelling MSc,University College London,Department of Chemistry,Full time,Register your interest in graduate study at UC...,September,"Full time - £14,100",MSc,1 year full time,London,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...
5996,Materials Chemistry - MSc,University of Bradford,Faculty of Life Sciences,Full time,We provide a unique Master’s education in Mate...,September,Please see the university website for further ...,MSc,1 year full time,Bradford,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...
5997,Materials Chemistry MSc,University of Edinburgh,School of Chemistry,Full time,Programme descriptionMaterials Chemistry has e...,September,Tuition fees vary between degree programmes. F...,MSc,1 year full-time,Edinburgh,United Kingdom,On Campus,https://www.findamasters.com/masters-degrees/c...
5998,Materials Engineering,University of Padua,School of Engineering,Full time,The Master's degree Materials Engineering is a...,October,Our tuition fees will not exceed 2700 euros pe...,MSc,2 years,Padua,Italy,On Campus,https://www.findamasters.com/masters-degrees/c...


## Here is the bash code for CommandLine.sh :

In [None]:
#!/bin/bash

# Create an empty merged file
touch merged_courses.tsv

for ((i = 1; i <= 6000; i++)); do
  folder="HTML_folders/page_${i}"
  file="html_${i}.html.tsv"

  if [ $i -eq 1 ]; then
    # For the first file, copy the whole content
    cat "${folder}/${file}" >> merged_courses.tsv
  else
    # For files 2 to 6000, omit the first row
    tail -n +2 "${folder}/${file}" >> merged_courses.tsv
  fi
done

printf "The merged_courses.tsv file is generated. \n"

# Question-1:

printf "# Question-1: \n"

# Read 'merged_courses.tsv' file and generate counts for each country
country_count=$(awk -F'\t' 'NR>1 { countries[$11]++ } END { for (country in countries) print "{\"country\": \"" country "\", \"counts\": " countries[country] "}" }' merged_courses.tsv)

# Sort the country count based on counts in descending order
sorted_country_count=$(echo "$country_count" | jq -s 'sort_by(-.counts)')

echo "$sorted_country_count" | jq '.[:5]'

# Extract the value of "country" from the first cell and store it in most_frequent_country
most_frequent_country=$(echo "$sorted_country_count" | jq -r '.[0].country')

echo "Most frequent country: $most_frequent_country"

# Loop over 'merged_courses.tsv' file and filter rows by most frequent country
city_list=$(awk -F'\t' -v most_frequent_country="$most_frequent_country" 'NR>1 && $11 == most_frequent_country {
    cities[$10]++;
}
END {
    for (city in cities) {
        printf "{\"country\": \"%s\", \"city\": \"%s\", \"city_occurrence\": %d}\n", most_frequent_country, city, cities[city];
    }
}' merged_courses.tsv | jq -s '.')

# Sort the city_list based on city_occurrence
sorted_city_list=$(echo "$city_list" | jq 'sort_by(-.city_occurrence)')

echo "$sorted_city_list" | jq '.[:5]'

# Extract the maximum city_occurrence value
max_occurrence=$(echo "$sorted_city_list" | jq '[.[] | .city_occurrence] | max')

# Find cities with the maximum city_occurrence
max_cities=$(echo "$sorted_city_list" | jq --arg max_occurrence "$max_occurrence" "[.[] | select(.city_occurrence == $max_occurrence) | .city]")

printf "\n The most Master's Degrees are in the following cities: "
echo "$max_cities"

# Question-2:

printf "# Question-2: \n"

# Initialize an empty array to store 'part_time' rows
part_time=()

# Read 'merged_courses.tsv' line by line
while IFS=$'\t' read -r col1 col2 col3 isItFullTime col5; do
    # Check if the value in the 'isItFullTime' column is 'Part time'
    if [ "$isItFullTime" = "Part time" ]; then
        # If true, add the entire row to the 'part_time' array
        part_time+=("$col1" "$col2" "$col3" "$isItFullTime" "$col5")
    fi
done < merged_courses.tsv

# Print the length of the 'part_time' array
echo "Number of colleges offering Part-Time education is: ${#part_time[@]}"

# Question-3:

printf "# Question-3: \n"

# Initialize an empty array to store rows containing 'Engineer' in 'courseName'
contain_engineer=()

# Read 'merged_courses.tsv' line by line
while IFS=$'\t' read -r courseName col2 col3 col4 col5; do
    # Check if 'courseName' column contains 'Engineer'
    if [[ "$courseName" == *"Engineer"* ]]; then
        # If true, add the entire row to 'contain_engineer' array
        contain_engineer+=("$courseName" "$col2" "$col3" "$col4" "$col5")
    fi
done < merged_courses.tsv

# Print the length of the 'contain_engineer' array
echo "Length of 'contain_engineer' list: ${#contain_engineer[@]}"

# Calculate the length of the 'contain_engineer' array
length=${#contain_engineer[@]}

# Perform the calculation using bc for floating-point arithmetic
result=$(echo "scale=2; $length / 6000 * 100" | bc)

echo "The percentage of courses in Engineering is: $result%"


### Here is the result:

In [None]:
(base) armanfeili@Armans-MacBook-Pro ADM-HW3 % bash CommandLine.sh
The merged_courses.tsv file is generated. 
# Question-1: 
[
  {
    "country": "United Kingdom",
    "counts": 4479
  },
  {
    "country": "Ireland",
    "counts": 251
  },
  {
    "country": "Netherlands",
    "counts": 210
  },
  {
    "country": "USA",
    "counts": 182
  },
  {
    "country": "Germany",
    "counts": 125
  }
]
Most frequent country: United Kingdom
[
  {
    "country": "United Kingdom",
    "city": "London",
    "city_occurrence": 1086
  },
  {
    "country": "United Kingdom",
    "city": "Glasgow",
    "city_occurrence": 296
  },
  {
    "country": "United Kingdom",
    "city": "Edinburgh",
    "city_occurrence": 247
  },
  {
    "country": "United Kingdom",
    "city": "Nottingham",
    "city_occurrence": 124
  },
  {
    "country": "United Kingdom",
    "city": "Bristol",
    "city_occurrence": 123
  }
]

 The most Master's Degrees are in the following cities: [
  "London"
]
# Question-2: 
Number of colleges offering Part-Time education is: 3185
# Question-3: 
Length of 'contain_engineer' list: 3075
The percentage of courses in Engineering is: 51.00%

![](./CommandLine-screenshot.png)