In [19]:
import pandas as pd
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori, association_rules

# Load the cleaned Internet data file.
df = pd.read_csv('Internet_cleaned_file.csv')

# Build a boolean mask that is True for every row whose industry category
# is anything other than the catch-all 'Other', then keep only those rows.
is_specific_industry = df['industry_category'].ne('Other')
nonempty_keywords_df = df.loc[is_specific_industry]

# Write the filtered rows (without the index column) to a new CSV file;
# the association-rule cell reads this file back in.
nonempty_keywords_df.to_csv('int_nonempty_keywords.csv', index=False)

In [20]:

# Association-rule mining on the filtered Internet data.
# Each row's selected attribute values form one transaction. Apriori keeps
# itemsets with support >= 0.1; rules are kept when confidence >= 0.5 and
# lift > 1, and are printed from highest to lowest lift.

# Read the data from the CSV file
df = pd.read_csv('int_nonempty_keywords.csv')

# Select the relevant attributes for association rule mining
attributes = ['salary_label', 'exp_en', 'eduBack_en', 'scale_en', 'Comp_en', 'city_en', 'industry_category']

df = df[attributes]

# Drop any rows with missing values
df = df.dropna()

# Convert the data into a list of transactions (one list of attribute values per row).
# NOTE(review): values are pooled without their column name, so an identical
# label appearing in two different columns would be treated as a single item —
# confirm labels are unique across the selected columns.
transactions = df.values.tolist()

# Convert the list of transactions into a binary encoded format
te = TransactionEncoder()
te_ary = te.fit_transform(transactions)
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)

# Perform association rule mining using the Apriori algorithm
frequent_itemsets = apriori(df_encoded, min_support=0.1, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)

# Filter rules based on lift greater than 1
rules = rules[rules['lift'] > 1]

# Sort the rules by lift in descending order
rules = rules.sort_values(by='lift', ascending=False)

# Print the generated rules: support and confidence as percentages, lift as a plain ratio
for idx, rule in rules.iterrows():
    # antecedents/consequents are frozensets of strings; their iteration order
    # can differ between interpreter runs (string hash randomization), so sort
    # the items to make the printed rule text reproducible.
    antecedent = ', '.join(sorted(rule['antecedents']))
    consequent = ', '.join(sorted(rule['consequents']))
    support = rule['support'] * 100
    confidence = rule['confidence'] * 100
    lift = round(rule['lift'],2)
    
    print(f'Antecedent: {antecedent}\nConsequent: {consequent}\nSupport: {support:.2f}%\nConfidence: {confidence:.2f}%\nLift: {lift}\n')


Antecedent: Operations, Private
Consequent: 5000-10000
Support: 10.28%
Confidence: 53.45%
Lift: 1.9

Antecedent: 5000-10000, Private
Consequent: Operations
Support: 10.28%
Confidence: 52.80%
Lift: 1.86

Antecedent: Operations
Consequent: 5000-10000
Support: 14.73%
Confidence: 51.96%
Lift: 1.85

Antecedent: 5000-10000
Consequent: Operations
Support: 14.73%
Confidence: 52.46%
Lift: 1.85

Antecedent: 5000-10000, Private
Consequent: Junior College
Support: 10.32%
Confidence: 53.01%
Lift: 1.61

Antecedent: 5000-10000
Consequent: Junior College
Support: 14.18%
Confidence: 50.49%
Lift: 1.53

Antecedent: 20000-50000
Consequent: Bachelor
Support: 13.01%
Confidence: 72.26%
Lift: 1.38

Antecedent: 20-99
Consequent: Private
Support: 20.54%
Confidence: 84.76%
Lift: 1.38

Antecedent: Development
Consequent: Bachelor
Support: 18.88%
Confidence: 69.15%
Lift: 1.32

Antecedent: 15000-20000
Consequent: Bachelor
Support: 13.46%
Confidence: 67.87%
Lift: 1.3

Antecedent: 3-5 years
Consequent: Bachelor
Suppo

In [17]:
# Load the cleaned Finance data file.
df = pd.read_csv('Finance_cleaned_file.csv')

# Build a boolean mask that is True for every row whose industry category
# is anything other than the catch-all 'Other', then keep only those rows.
is_specific_industry = df['industry_category'].ne('Other')
nonempty_keywords_df = df.loc[is_specific_industry]

# Write the filtered rows (without the index column) to a new CSV file;
# the association-rule cell reads this file back in.
nonempty_keywords_df.to_csv('fin_nonempty_keywords.csv', index=False)

In [18]:
# Association-rule mining on the filtered Finance data.
# Each row's selected attribute values form one transaction. Apriori keeps
# itemsets with support >= 0.1; rules are kept when confidence >= 0.5 and
# lift > 1, and are printed from highest to lowest lift.

# Read the data from the CSV file
df = pd.read_csv('fin_nonempty_keywords.csv')

# Select the relevant attributes for association rule mining.
# ('Comp_en' was previously listed twice; each attribute is selected once.)
attributes = ['salary_label', 'exp_en', 'eduBack_en', 'scale_en', 'Comp_en', 'city_en', 'industry_category']

df = df[attributes]

# Drop any rows with missing values
df = df.dropna()

# Convert the data into a list of transactions (one list of attribute values per row).
# NOTE(review): values are pooled without their column name, so an identical
# label appearing in two different columns would be treated as a single item —
# confirm labels are unique across the selected columns.
transactions = df.values.tolist()

# Convert the list of transactions into a binary encoded format
te = TransactionEncoder()
te_ary = te.fit_transform(transactions)
df_encoded = pd.DataFrame(te_ary, columns=te.columns_)

# Perform association rule mining using the Apriori algorithm
frequent_itemsets = apriori(df_encoded, min_support=0.1, use_colnames=True)
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.5)

# Filter rules based on lift greater than 1
rules = rules[rules['lift'] > 1]

# Sort the rules by lift in descending order
rules = rules.sort_values(by='lift', ascending=False)

# Print the generated rules: support and confidence as percentages, lift as a plain ratio
for idx, rule in rules.iterrows():
    # antecedents/consequents are frozensets of strings; their iteration order
    # can differ between interpreter runs (string hash randomization), so sort
    # the items to make the printed rule text reproducible.
    antecedent = ', '.join(sorted(rule['antecedents']))
    consequent = ', '.join(sorted(rule['consequents']))
    support = rule['support'] * 100
    confidence = rule['confidence'] * 100
    lift = round(rule['lift'],2)
    
    print(f'Antecedent: {antecedent}\nConsequent: {consequent}\nSupport: {support:.2f}%\nConfidence: {confidence:.2f}%\nLift: {lift}\n')


Antecedent: 20-99
Consequent: Private
Support: 10.92%
Confidence: 71.60%
Lift: 1.9

Antecedent: Bachelor, Private
Consequent: Investment
Support: 10.32%
Confidence: 72.00%
Lift: 1.7

Antecedent: Over 10000
Consequent: Junior College
Support: 10.52%
Confidence: 60.35%
Lift: 1.62

Antecedent: Bachelor
Consequent: Investment
Support: 26.13%
Confidence: 65.30%
Lift: 1.54

Antecedent: Investment
Consequent: Bachelor
Support: 26.13%
Confidence: 61.78%
Lift: 1.54

Antecedent: Investment, Private
Consequent: Bachelor
Support: 10.32%
Confidence: 59.89%
Lift: 1.5

Antecedent: Insurance
Consequent: Junior College
Support: 12.29%
Confidence: 50.47%
Lift: 1.36

Antecedent: No Work Experience Required, Banking
Consequent: Junior College
Support: 10.01%
Confidence: 50.23%
Lift: 1.35

Antecedent: Junior College, Banking
Consequent: No Work Experience Required
Support: 10.01%
Confidence: 64.16%
Lift: 1.33

Antecedent: Over 10000
Consequent: No Work Experience Required
Support: 11.16%
Confidence: 64.03%