In [1]:
import pandas as pd

# Location of the course dataset, relative to the notebook's working directory
file_path = 'sorted_Data.csv'

# Parse the CSV into a DataFrame; the later cells all operate on `data`
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the leading 15 records to get a feel for the columns
data.head(n=15)


Unnamed: 0,SchoolDepartment,CourseTitle,Finish Time,Email,RequiredSkill,Professors,GainedSkill,strartingOrAsgnmtDate,Hour
0,engineering leadership and strategy,Challenge Coin,,jbeney8@unesco.org,technical writing,Rachid Khattabi,graphic design,2017-11-16,18:13:00
1,governance and policy studies,Committees,,fbucknell4@clickbank.net,issue and risk management,Hajar El Amrani,writing and editing,2017-12-12,19:27:00
2,governance and policy studies,7 Things to do in District 7,,fbucknell4@clickbank.net,video production and editing,Hajar El Amrani,"graphic design, proofreading and quality assur...",2017-12-12,19:33:00
3,governance and policy studies,District 6 Christmas Card,,fbucknell4@clickbank.net,"printed material (posters, brochures, reports)",Hajar El Amrani,"writing and editing, technical documentation, ...",2017-12-12,19:39:00
4,environmental and civil engineering,EVENT: Volunteer Recognition certificates,,dwetherheadb@livejournal.com,"printed material (posters, brochures, reports)",Samira Alaoui,"event coordination, graphic design, technical ...",2018-01-02,18:13:00
5,environmental and civil engineering,Volunteer Recognition Brunch RSVP reminder,,dwetherheadb@livejournal.com,internal and external communication,Samira Alaoui,"internal and external communication, writing a...",2018-01-03,00:28:00
6,governance and policy studies,Committee Selection Graphic,,fbucknell4@clickbank.net,issue and risk management,Hajar El Amrani,graphic design,2018-01-03,17:27:00
7,engineering leadership and strategy,End of the Year Video,,bwidmoora@alibaba.com,video production and editing,Nawal Chraibi,"writing and editing, video editing and product...",2018-01-03,20:10:00
8,social impact engineering,Free workshop on what you need to know about r...,,dwetherheadb@livejournal.com,news writing and editing,Samira Alaoui,"public outreach and engagement, news and media...",2018-01-03,22:03:00
9,environmental and civil engineering,Extra-Hire Job Opportunity with Parks and Recr...,,dwetherheadb@livejournal.com,internal and external communication,Samira Alaoui,"internal and external communication, proofread...",2018-01-06,17:40:00


In [3]:
from datetime import timedelta

# Number of courses tracked per professor at one time: a course is given a
# finish time once this many later courses have started for the same professor.
MAX_CONCURRENT_COURSES = 5

# Combine date and time into a single datetime column
data['StartTime'] = pd.to_datetime(data['strartingOrAsgnmtDate'] + ' ' + data['Hour'])

# Sort by professor and start time so each professor's courses are in
# chronological order, then renumber rows 0..n-1
data = data.sort_values(by=['Professors', 'StartTime']).reset_index(drop=True)

# Group by professor
professor_groups = data.groupby('Professors')

# A course's FinishTime is the StartTime of the course that sits
# MAX_CONCURRENT_COURSES rows later within the same professor's group.
# Shifting StartTime backwards inside each group yields exactly that value;
# the last MAX_CONCURRENT_COURSES courses of every professor have no such
# successor and are left as NaT. This replaces a row-by-row loop that kept a
# manual queue of row indices and produced the same assignments.
data['FinishTime'] = professor_groups['StartTime'].shift(-MAX_CONCURRENT_COURSES)

# Save the updated dataset
data.to_csv('updated_sorted_Data.csv', index=False)

# Display a preview
print(data[['Professors', 'StartTime', 'FinishTime']].head(15))


         Professors           StartTime          FinishTime
0   Ahmed Bouzoubaa 2020-02-10 22:37:00 2020-03-17 21:53:00
1   Ahmed Bouzoubaa 2020-02-10 22:49:00 2020-03-23 17:37:00
2   Ahmed Bouzoubaa 2020-03-11 00:49:00 2020-03-23 17:51:00
3   Ahmed Bouzoubaa 2020-03-13 21:25:00 2020-03-24 20:18:00
4   Ahmed Bouzoubaa 2020-03-13 21:30:00 2020-04-09 18:33:00
5   Ahmed Bouzoubaa 2020-03-17 21:53:00 2020-04-09 22:13:00
6   Ahmed Bouzoubaa 2020-03-23 17:37:00 2020-05-11 20:24:00
7   Ahmed Bouzoubaa 2020-03-23 17:51:00 2020-06-10 17:19:00
8   Ahmed Bouzoubaa 2020-03-24 20:18:00 2020-06-11 01:17:00
9   Ahmed Bouzoubaa 2020-04-09 18:33:00 2020-06-11 17:11:00
10  Ahmed Bouzoubaa 2020-04-09 22:13:00 2020-07-10 00:14:00
11  Ahmed Bouzoubaa 2020-05-11 20:24:00 2020-07-20 16:26:00
12  Ahmed Bouzoubaa 2020-06-10 17:19:00 2020-07-20 19:28:00
13  Ahmed Bouzoubaa 2020-06-11 01:17:00 2020-07-20 19:57:00
14  Ahmed Bouzoubaa 2020-06-11 17:11:00 2020-07-27 17:39:00


In [5]:
# Count the missing values in each column
print(data.isna().sum())

SchoolDepartment            0
CourseTitle                 0
Finish Time              2133
Email                       0
RequiredSkill               0
Professors                  0
GainedSkill                 0
strartingOrAsgnmtDate       0
Hour                        0
StartTime                   0
FinishTime                 58
dtype: int64


In [None]:
# Number of courses tracked per professor at one time: a course is given a
# finish time once this many later courses have started for the same professor.
MAX_CONCURRENT_COURSES = 5

# Combine date and time into a single datetime column
data['StartTime'] = pd.to_datetime(data['strartingOrAsgnmtDate'] + ' ' + data['Hour'])

# Ensure FinishTime column exists; if not, create it with NaT. If it already
# exists, coerce it to datetime so any non-date entries become NaT.
if 'FinishTime' not in data.columns:
    data['FinishTime'] = pd.NaT
else:
    data['FinishTime'] = pd.to_datetime(data['FinishTime'], errors='coerce')

# Sort by professor and start time so each professor's courses are in
# chronological order, then renumber rows 0..n-1
data = data.sort_values(by=['Professors', 'StartTime']).reset_index(drop=True)

# Group by professor
professor_groups = data.groupby('Professors')

# For every course, look up the StartTime of the course that sits
# MAX_CONCURRENT_COURSES rows later within the same professor's group.
# This replaces a row-by-row loop that kept a manual queue of row indices.
successor_start = professor_groups['StartTime'].shift(-MAX_CONCURRENT_COURSES)

# Use the successor's start as the finish time wherever one is available;
# rows without a successor (the last MAX_CONCURRENT_COURSES courses of each
# professor, or a successor whose start is NaT) keep their existing FinishTime.
data['FinishTime'] = successor_start.fillna(data['FinishTime'])

# Save the updated dataset
data.to_csv('updated_sorted_Data.csv', index=False)

# Display a preview
print(data[['Professors', 'StartTime', 'FinishTime']].head(15))

In [None]:
# Count the missing values in each column
print(data.isna().sum())

In [None]:
# Replace every still-missing FinishTime with the "Not Finished" marker.
# NOTE(review): FinishTime presumably holds datetimes at this point, so the
# column will contain a mix of timestamps and strings afterwards - confirm
# that downstream consumers expect that.
finish_filled = data['FinishTime'].fillna(value="Not Finished")
data['FinishTime'] = finish_filled

# Persist the result, omitting the row index
data.to_csv('updated_sorted_Data.csv', index=False)

# Show the first 15 rows of the relevant columns
preview_columns = ['Professors', 'StartTime', 'FinishTime']
print(data[preview_columns].head(15))


In [None]:
# Count the missing values in each column
print(data.isna().sum())

In [None]:
# Count the populated (non-missing) values in each column
print(data.notna().sum())

In [None]:
# Display the whole DataFrame as the cell's output
data

In [25]:
import pandas as pd

# Sample records for demonstrating skill normalization. They are bound to
# their own name rather than `data`, so the course DataFrame that other cells
# keep in `data` is not overwritten by a plain dict when this cell runs.
sample_records = {
    'RequiredSkill': ['Python, Java, SQL', 'Python', 'Java, C++', 'SQL', 'C++']
}
df = pd.DataFrame(sample_records)

# Step 1: Split each entry on commas and explode the resulting lists so that
# every individual skill occupies its own row
skills_long = df['RequiredSkill'].str.split(',').explode()

# Step 2: Trim surrounding whitespace, then discard missing values and empty
# tokens (an empty token appears when an entry has a stray or trailing comma)
skills_clean = skills_long.str.strip().dropna()
skills_clean = skills_clean[skills_clean != '']

# Step 3: Keep the first occurrence of each skill, in order of appearance
# (rows top to bottom, left to right within a row)
unique_skills = (
    skills_clean
    .drop_duplicates()
    .rename('Skill')
    .reset_index(drop=True)
)

# Display the resulting list of unique skills
unique_skills.head()


0    Python
1      Java
2       SQL
3       C++
Name: Skill, dtype: object