In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

import os
import sys
from tempfile import NamedTemporaryFile
from urllib.request import urlopen
from urllib.parse import unquote, urlparse
from urllib.error import HTTPError
from zipfile import ZipFile
import tarfile
import shutil

CHUNK_SIZE = 40960
DATA_SOURCE_MAPPING = 'social-media-usage-and-emotional-well-being:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-data-sets%2F5043355%2F8460631%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240602%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240602T155958Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D2b58ff96b03ea41e0e89ee0d11e9d9e130e6b8240902e4189d46257fd78470189fd2280dd05c15b5b689c5d18a4288a52f5f7126771bdf80fd5c2cd369684ac3e5699575947a2fd281caa809172d885a1589ee990392d0083966475fbb65308826ef871fc564b3ddd05f8adedffad98d3edb6769482233cce8271e676b34acec537c9bd490c8183cab4adf907ccb1a929f9779a9023a3c77b01d9cc9b80d8bfd98b9b624635e00b91c0f8626bfc46eb0f5dc5c9b08e750c3d1b237a0555ffa71f4c416fb3c117db5a061c164ac1bdd54e69f818be19ff211a6f3f2f23aa67c2d80ba81ae9565316d428357331c8ae11d6eed1a32ddbaf6825a6a14f9d4a53a38'

KAGGLE_INPUT_PATH='/kaggle/input'
KAGGLE_WORKING_PATH='/kaggle/working'
KAGGLE_SYMLINK='kaggle'

!umount /kaggle/input/ 2> /dev/null
shutil.rmtree('/kaggle/input', ignore_errors=True)
os.makedirs(KAGGLE_INPUT_PATH, 0o777, exist_ok=True)
os.makedirs(KAGGLE_WORKING_PATH, 0o777, exist_ok=True)

try:
  os.symlink(KAGGLE_INPUT_PATH, os.path.join("..", 'input'), target_is_directory=True)
except FileExistsError:
  pass
try:
  os.symlink(KAGGLE_WORKING_PATH, os.path.join("..", 'working'), target_is_directory=True)
except FileExistsError:
  pass

# Download every configured data source into KAGGLE_INPUT_PATH/<directory>
# and unpack it (zip archives via ZipFile, everything else via tarfile).
for data_source_mapping in DATA_SOURCE_MAPPING.split(','):
    # Split on the first colon only, so a literal ':' in the URL part cannot
    # break the two-value unpacking.
    directory, download_url_encoded = data_source_mapping.split(':', 1)
    download_url = unquote(download_url_encoded)
    filename = urlparse(download_url).path
    destination_path = os.path.join(KAGGLE_INPUT_PATH, directory)
    try:
        with urlopen(download_url) as fileres, NamedTemporaryFile() as tfile:
            # The header is absent for some responses (e.g. chunked transfer);
            # treat that as "size unknown" instead of crashing on int(None).
            content_length = fileres.headers['content-length']
            total_length = int(content_length) if content_length else 0
            print(f'Downloading {directory}, {content_length} bytes compressed')
            dl = 0
            data = fileres.read(CHUNK_SIZE)
            while len(data) > 0:
                dl += len(data)
                tfile.write(data)
                # 50-character progress bar; stays empty when the size is unknown.
                done = min(50, int(50 * dl / total_length)) if total_length > 0 else 0
                sys.stdout.write(f"\r[{'=' * done}{' ' * (50-done)}] {dl} bytes downloaded")
                sys.stdout.flush()
                data = fileres.read(CHUNK_SIZE)
            # tarfile re-opens the file by name, so buffered bytes must reach
            # disk before extraction starts.
            tfile.flush()
            if filename.endswith('.zip'):
                with ZipFile(tfile) as zfile:
                    zfile.extractall(destination_path)
            else:
                # Use a distinct name so the imported `tarfile` module is not rebound.
                with tarfile.open(tfile.name) as tar_archive:
                    tar_archive.extractall(destination_path)
            print(f'\nDownloaded and uncompressed: {directory}')
    except HTTPError:
        print(f'Failed to load (likely expired) {download_url} to path {destination_path}')
        continue
    except OSError:
        print(f'Failed to load {download_url} to path {destination_path}')
        continue

print('Data source import complete.')


In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np
import pandas as pd
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import math
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found below the input directory.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = (os.path.join(dirname, filename) for filename in filenames)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Only train.csv is loaded; it serves as the source for both the training
# and the testing subsets later on. Lines pandas cannot parse are skipped.
train_df = pd.read_csv(
    '/kaggle/input/social-media-usage-and-emotional-well-being/train.csv',
    on_bad_lines='skip',
)


In [None]:
# Preview the first rows, then report how many missing values each column has.
head_preview = train_df.head()
print(head_preview)

missing_per_column = train_df.isnull().sum()
print(missing_per_column)

In [None]:
# Remove every row that has at least one missing value (in place), then
# re-print the per-column missing counts as a check.
train_df.dropna(axis=0, how='any', inplace=True)
remaining_missing = train_df.isna().sum()
print(remaining_missing)

In [None]:

# Gender also contains invalid data: trim surrounding whitespace and
# lower-case the labels first, then show the distinct values that remain.
gender_trimmed = train_df['Gender'].str.strip()
train_df['Gender'] = gender_trimmed.str.lower()
train_df['Gender'].unique()




In [None]:
# Keep only rows whose Gender is one of the accepted labels.
valid_genders = ['male', 'female', 'non-binary']
has_valid_gender = train_df['Gender'].isin(valid_genders)
train_df = train_df[has_valid_gender]
train_df['Gender'].unique()

In [None]:
# Age also has invalid data; this cell only lists the distinct values.
# NOTE(review): nothing here cleans Age - confirm the earlier filtering has
# already removed the invalid entries before Age is scaled.
pd.unique(train_df['Age'])

In [None]:
# Count of each dominant emotion, split by gender.
fig = plt.figure(figsize=(25, 25))

ax = sns.countplot(data=train_df, x='Dominant_Emotion', hue='Gender')

# Write each bar's height 5 points above its top, centred horizontally.
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    ax.annotate(
        f'{bar_height}',
        (bar_centre, bar_height),
        xytext=(0, 5),
        textcoords='offset points',
        ha='center',
        va='center',
        fontsize=12,
        color='black',
    )

plt.tight_layout()
plt.grid()
plt.show()

In [None]:
# Count of each dominant emotion, split by platform.
ax2 = sns.countplot(data=train_df, x='Dominant_Emotion', hue='Platform')

# Write each bar's height 1 point above its top, centred horizontally.
for bar in ax2.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    ax2.annotate(
        f'{bar_height}',
        (bar_centre, bar_height),
        xytext=(0, 1),
        textcoords='offset points',
        ha='center',
        va='center',
        fontsize=8,
        color='black',
    )

plt.tight_layout()
plt.grid()
plt.show()

In [None]:
    # preparing data
    # Dense one-hot encoder, so the result can be wrapped in a DataFrame directly.
    one = OneHotEncoder(sparse_output=False)
    #Gender Encoding
    encoded_Gender = one.fit_transform(train_df[['Gender']])
    # Name each output column after the category the encoder actually learned
    # rather than assuming a fixed order and count; a missing or reordered
    # category would otherwise mislabel columns or fail on a shape mismatch.
    gender_display_names = {'female': 'Female', 'male': 'Male', 'non-binary': 'Non-Binary'}
    labels = [gender_display_names.get(str(category), str(category))
              for category in one.categories_[0]]
    gender_df = pd.DataFrame(encoded_Gender,columns=labels)
    print(gender_df)

In [None]:
#Encoding Platforms
# Re-fits the shared encoder `one` on the Platform column.
encoded_Platform  = one.fit_transform(train_df[['Platform']])
labels_platform  = ['FaceBook','Instagram','LinkedIN','Snapchat','Telegram','Twitter','WhatsApp']
# Match every learned category to its preferred spelling case-insensitively
# instead of relying on position; a category without a preferred spelling
# keeps its own name, so the column count always matches the encoded array.
preferred_spelling = {name.lower(): name for name in labels_platform}
platform_columns = [preferred_spelling.get(str(category).strip().lower(), str(category))
                    for category in one.categories_[0]]
platform_df= pd.DataFrame(encoded_Platform,columns=platform_columns)

print(platform_df)






Since the models require numeric input, we use a one-hot encoder to convert the categorical columns (Gender and Platform) to numbers.

Moreover, since the encoded values are only 0 or 1, these columns do not need to be scaled.

We scale the remaining numeric columns and then simply append the encoded columns after them.

In [None]:
# Build the feature matrix, split it 80/20 and standardise the numeric columns.
y_train = train_df['Dominant_Emotion']
# Drop the target, the two categorical columns (added back below as one-hot
# frames) and User_ID; what remains are the columns to be scaled.
x_train = train_df.drop(columns=['Dominant_Emotion','Gender','Platform','User_ID'])
column = x_train.columns
numeric_columns = list(column)

# gender_df and platform_df were built from plain arrays and therefore carry a
# fresh 0..n-1 index; give the numeric features the same index so the
# column-wise concat lines rows up by position.
features = pd.concat(
    [x_train.reset_index(drop=True).astype(float), gender_df, platform_df],
    axis=1,
)

# Split BEFORE scaling so the scaler never sees the held-out rows.
train_x, test_x, train_y, test_y = train_test_split(features, y_train, test_size=0.2, random_state=1)
train_x = train_x.copy()
test_x = test_x.copy()

#Scaling data
# Fit on the training rows only, then apply the same transform to the test rows.
scale = StandardScaler()
train_x[numeric_columns] = scale.fit_transform(train_x[numeric_columns])
test_x[numeric_columns] = scale.transform(test_x[numeric_columns])

# Full feature table in original row order, scaled with training statistics.
scaled_x = pd.concat([train_x, test_x]).sort_index()
print(scaled_x.columns)
# info() prints its report itself and returns None, so it is not wrapped in print().
train_x.info()

In [None]:
# training a model
# Accuracy (in percent) and short name of each model fitted in this notebook.
scorelist = []
namelist = []
#using SVM Model
svm = SVC(random_state=1)
svm.fit(train_x, train_y)

score = svm.score(test_x, test_y) * 100
scorelist.append(score)
namelist.append("svm")
print(score)

# Visualising result: actual labels (blue dots) against predicted labels
# (orange stars), one position per test sample.
x = np.arange(1, len(test_y) + 1)
sns.scatterplot(x=x, y=test_y, color='blue', alpha=0.8)
sns.scatterplot(x=x, y=svm.predict(test_x), color='orange', alpha=0.6, marker='*')
# Titled like the other model plots so the figure is identifiable on its own.
plt.title("Support Vector Machine")
plt.show()


In [None]:
# using K Nearest Neighbour
# k is the floor of sqrt(number of training rows), bumped by one when even
# so that k is always odd.
n = math.floor(math.sqrt(len(train_x)))
if n % 2 == 0:
    n += 1

knn = KNeighborsClassifier(n_neighbors=n)
knn.fit(train_x, train_y)

score = knn.score(test_x, test_y) * 100
scorelist.append(score)
namelist.append("knn")
print(score)

# Visualising result accuracy: actual labels (blue dots) against predicted
# labels (orange stars), one position per test sample.
x = np.arange(1, len(test_y) + 1)
knn_predictions = knn.predict(test_x)
sns.scatterplot(x=x, y=test_y, color='blue', alpha=0.8)
sns.scatterplot(x=x, y=knn_predictions, color='orange', alpha=0.6, marker='*')
plt.title("K Nearest Neighbour")
plt.show()

In [None]:
# using Logistic Regression
log = LogisticRegression(random_state=1)
log.fit(train_x, train_y)

# Accuracy on the held-out rows, as a percentage.
score = log.score(test_x, test_y) * 100
scorelist.append(score)
print(score)
namelist.append("log")

# Visualising the accuracy: actual labels (blue dots) against predicted
# labels (orange stars), one position per test sample.
x = np.arange(1, len(test_y) + 1)
log_predictions = log.predict(test_x)
sns.scatterplot(x=x, y=test_y, color='blue', alpha=0.8)
sns.scatterplot(x=x, y=log_predictions, color='orange', alpha=0.6, marker='*')
plt.title("Logistic Regression")
plt.show()

In [None]:
#using Naive Bayes
nb = GaussianNB()
nb.fit(train_x, train_y)

# Accuracy on the held-out rows, as a percentage.
score = nb.score(test_x, test_y) * 100
scorelist.append(score)
namelist.append("NB")
print(score)

# Visualising result accuracy: actual labels (blue dots) against predicted
# labels (orange stars), one position per test sample.
x = np.arange(1, len(test_y) + 1)
nb_predictions = nb.predict(test_x)
sns.scatterplot(x=x, y=test_y, color='blue', alpha=0.8)
sns.scatterplot(x=x, y=nb_predictions, color='orange', alpha=0.6, marker='*')
plt.title("Naive Bayes")
plt.show()

In [None]:
# using Decision Tree Classifier
tree = DecisionTreeClassifier(random_state=1)
tree.fit(train_x, train_y)

# Accuracy on the held-out rows, as a percentage.
score = tree.score(test_x, test_y) * 100
scorelist.append(score)
namelist.append("DecTree")
print(score)

# Visualising result accuracy: actual labels (blue dots) against predicted
# labels (orange stars), one position per test sample.
x = np.arange(1, len(test_y) + 1)
tree_predictions = tree.predict(test_x)
sns.scatterplot(x=x, y=test_y, color='blue', alpha=0.8)
sns.scatterplot(x=x, y=tree_predictions, color='orange', alpha=0.6, marker='*')
plt.title("Decision Tree Classifier")
plt.show()

In [None]:
# using random forest classifier
rfc = RandomForestClassifier(random_state=1)
rfc.fit(train_x, train_y)

# Accuracy on the held-out rows, as a percentage.
score = rfc.score(test_x, test_y) * 100
scorelist.append(score)
namelist.append("RFC")
print(score)

# Visualising result accuracy: actual labels (blue dots) against predicted
# labels (orange stars), one position per test sample.
x = np.arange(1, len(test_y) + 1)
rfc_predictions = rfc.predict(test_x)
sns.scatterplot(x=x, y=test_y, color='blue', alpha=0.8)
sns.scatterplot(x=x, y=rfc_predictions, color='orange', alpha=0.6, marker='*')
plt.title("Random Forest Classifier")
plt.show()

In [None]:

# Disabled helper: the whole definition sits inside a bare string literal, so
# none of it is executed. As written, readData() prompts for age, gender,
# platform and daily activity counts, builds a one-row DataFrame from the
# answers and returns rfc.predict() for that row.
# NOTE(review): the answers are passed to rfc as raw values, without the
# StandardScaler transform that was applied to the training features -
# confirm and fix before re-enabling.
'''
def readData():
    Age = int(input("Enter your Age \n"))
    Gender = int(input("Enter your gender 1.Female 2.Male 3.Non-Binary\n"))
    print("Enter your most used platform")
    print("[1.'FaceBook',2.'Instagram',3.'LinkedIN',4.'Snapchat',5.'Telegram',6.'Twitter',7.'WhatsApp']")
    Platform = int(input())
    use_time = int(input("Enter avg daily use time in minutes\n"))
    posts = int(input("Enter avg posts per day \n"))
    likes = int(input("Enter avg likes per day\n"))
    comments = int(input("Enter avg comments per day \n"))
    messages = int(input("Enter avg messages sent received per day \n"))

    x_custom_test = {
        'Age': Age,
        'Daily_Usage_Time (minutes)': use_time,
        'Posts_Per_Day': posts,
        'Likes_Received_Per_Day': likes,
        'Comments_Received_Per_Day': comments,
        'Messages_Sent_Per_Day': messages,
        'Female': 1 if Gender == 1 else 0,
        'Male': 1 if Gender == 2 else 0,
        'Non-Binary': 1 if Gender == 3 else 0,
        'FaceBook': 1 if Platform == 1 else 0,
        'Instagram': 1 if Platform == 2 else 0,
        'LinkedIN': 1 if Platform == 3 else 0,
        'Snapchat': 1 if Platform == 4 else 0,
        'Telegram': 1 if Platform == 5 else 0,
        'Twitter': 1 if Platform == 6 else 0,
        'WhatsApp': 1 if Platform == 7 else 0
    }
    custom_test = pd.DataFrame(x_custom_test, index=[0])
    return rfc.predict(custom_test) '''


In [None]:
# Disabled driver: kept inside a bare string literal, so it does not run.
# As written, it asks whether to make a prediction and, when the answer is 1,
# prints the value returned by readData().
'''# fun
yes_no = int(input("Do you want to predict your Nature"))
#using random forest to predict data
if(yes_no==1):
    prediction= readData()
    print(prediction)


'''

If you want to try the last part (the interactive prediction), open my Google Colab notebook, which I will share soon.