In [2]:
# Import Pandas
import pandas as pd

# Load the dataset into a Pandas DataFrame, decoding the file as latin1
df = pd.read_csv("test.csv", encoding='latin1')

# Number of rows to discard from the end of the file.
# NOTE(review): presumably these trailing rows are empty/unusable -- confirm,
# and consider df.dropna(subset=['text']) if that is the real intent.
TRAILING_ROWS_TO_DROP = 1281

# Drop the trailing rows by rebinding `df` rather than mutating it in place,
# so the cell reads as one load -> trim step and stays safe to re-run
df = df.drop(df.tail(TRAILING_ROWS_TO_DROP).index)

# Display the DataFrame (pandas shows only the first and last rows of a long frame)
df

Unnamed: 0,textID,text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,morning,0-20,Afghanistan,38928346.0,652860.0,60.0
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,noon,21-30,Albania,2877797.0,27400.0,105.0
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,night,31-45,Algeria,43851044.0,2381740.0,18.0
3,01082688c6,happy bday!,positive,morning,46-60,Andorra,77265.0,470.0,164.0
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,noon,60-70,Angola,32866272.0,1246700.0,26.0
...,...,...,...,...,...,...,...,...,...
3529,e5f0e6ef4b,"its at 3 am, im very tired but i can`t sleep ...",negative,noon,21-30,Nicaragua,6624554.0,120340.0,55.0
3530,416863ce47,All alone in this old house again. Thanks for...,positive,night,31-45,Niger,24206644.0,1266700.0,19.0
3531,6332da480c,I know what you mean. My little dog is sinkin...,negative,morning,46-60,Nigeria,206139589.0,910770.0,226.0
3532,df1baec676,_sutra what is your next youtube video gonna b...,positive,noon,60-70,North Korea,25778816.0,120410.0,214.0


In [4]:
import re

# Function to remove URLs from a single text
def remove_urls(text):
    """Return ``text`` with every URL removed.

    A URL is any run of non-whitespace characters that starts with
    ``http://``, ``https://`` or ``www.``. Whitespace surrounding a removed
    URL is left exactly as it was.

    Parameters
    ----------
    text : str
        The text to clean. Non-string values (for example the NaN that
        pandas produces for an empty CSV cell) are returned unchanged
        instead of raising ``TypeError``.

    Returns
    -------
    str
        The text without URLs, or the original value if it was not a string.
    """
    # Missing values are passed through so one empty cell cannot abort .apply()
    if not isinstance(text, str):
        return text
    # Regular expression pattern to match URLs
    url_pattern = r'(https?://\S+|www\.\S+)'
    return re.sub(url_pattern, '', text)

# Strip URLs from every tweet and store the result in a new 'clean_url_text'
# column; the original 'text' column is left untouched
df['clean_url_text'] = df['text'].map(remove_urls)

In [5]:
# Inspect the frame: it now carries the 'clean_url_text' column
df

Unnamed: 0,textID,text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²),clean_url_text
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,morning,0-20,Afghanistan,38928346.0,652860.0,60.0,Last session of the day
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,noon,21-30,Albania,2877797.0,27400.0,105.0,Shanghai is also really exciting (precisely -...
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,night,31-45,Algeria,43851044.0,2381740.0,18.0,"Recession hit Veronique Branquinho, she has to..."
3,01082688c6,happy bday!,positive,morning,46-60,Andorra,77265.0,470.0,164.0,happy bday!
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,noon,60-70,Angola,32866272.0,1246700.0,26.0,- I like it!!
...,...,...,...,...,...,...,...,...,...,...
3529,e5f0e6ef4b,"its at 3 am, im very tired but i can`t sleep ...",negative,noon,21-30,Nicaragua,6624554.0,120340.0,55.0,"its at 3 am, im very tired but i can`t sleep ..."
3530,416863ce47,All alone in this old house again. Thanks for...,positive,night,31-45,Niger,24206644.0,1266700.0,19.0,All alone in this old house again. Thanks for...
3531,6332da480c,I know what you mean. My little dog is sinkin...,negative,morning,46-60,Nigeria,206139589.0,910770.0,226.0,I know what you mean. My little dog is sinkin...
3532,df1baec676,_sutra what is your next youtube video gonna b...,positive,noon,60-70,North Korea,25778816.0,120410.0,214.0,_sutra what is your next youtube video gonna b...


In [6]:
import nltk
from nltk.corpus import stopwords

# Download the NLTK data this notebook needs:
#   - 'stopwords' backs the stop-word set built below
#   - 'punkt' / 'punkt_tab' hold the tokenizer models that nltk.word_tokenize
#     loads; without them word_tokenize raises LookupError on a fresh machine.
#     Newer NLTK releases load 'punkt_tab', older ones 'punkt', so both are
#     fetched.
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# Set of English stopwords
stop_words = set(stopwords.words('english'))

# Function to remove stopwords from a single text
def remove_stopwords(text):
    """Tokenize ``text`` and drop every token found in ``stop_words``.

    Tokens are compared in lower case; the tokens that survive keep their
    original casing and are re-joined with single spaces.
    """
    kept = []
    for token in nltk.word_tokenize(text):
        if token.lower() in stop_words:
            continue
        kept.append(token)
    return ' '.join(kept)

# Remove stop words from the URL-free text and keep the result in its own
# 'clean_stopwords_text' column
df['clean_stopwords_text'] = df['clean_url_text'].map(remove_stopwords)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\CCL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Inspect the frame: it now carries the 'clean_stopwords_text' column
df

Unnamed: 0,textID,text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²),clean_url_text,clean_stopwords_text
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,morning,0-20,Afghanistan,38928346.0,652860.0,60.0,Last session of the day,Last session day
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,noon,21-30,Albania,2877797.0,27400.0,105.0,Shanghai is also really exciting (precisely -...,Shanghai also really exciting ( precisely -- s...
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,night,31-45,Algeria,43851044.0,2381740.0,18.0,"Recession hit Veronique Branquinho, she has to...","Recession hit Veronique Branquinho , quit comp..."
3,01082688c6,happy bday!,positive,morning,46-60,Andorra,77265.0,470.0,164.0,happy bday!,happy bday !
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,noon,60-70,Angola,32866272.0,1246700.0,26.0,- I like it!!,- like ! !
...,...,...,...,...,...,...,...,...,...,...,...
3529,e5f0e6ef4b,"its at 3 am, im very tired but i can`t sleep ...",negative,noon,21-30,Nicaragua,6624554.0,120340.0,55.0,"its at 3 am, im very tired but i can`t sleep ...","3 , im tired ` sleep try"
3530,416863ce47,All alone in this old house again. Thanks for...,positive,night,31-45,Niger,24206644.0,1266700.0,19.0,All alone in this old house again. Thanks for...,alone old house . Thanks net keeps alive kicki...
3531,6332da480c,I know what you mean. My little dog is sinkin...,negative,morning,46-60,Nigeria,206139589.0,910770.0,226.0,I know what you mean. My little dog is sinkin...,know mean . little dog sinking depression ... ...
3532,df1baec676,_sutra what is your next youtube video gonna b...,positive,noon,60-70,North Korea,25778816.0,120410.0,214.0,_sutra what is your next youtube video gonna b...,_sutra next youtube video gon na ? love videos !


In [10]:
import re

# Function to remove digits from a single text
def remove_digits(text):
    """Return ``text`` with every run of digits deleted.

    Only the digits are removed; surrounding whitespace and punctuation
    stay in place.
    """
    digit_run = re.compile(r'\d+')
    return digit_run.sub('', text)

# Strip digits from the stop-word-free text and keep the result in its own
# 'clean_digits_text' column
df['clean_digits_text'] = df['clean_stopwords_text'].map(remove_digits)

In [11]:
# Inspect the frame: it now carries the 'clean_digits_text' column
df

Unnamed: 0,textID,text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²),clean_url_text,clean_stopwords_text,clean_digits_text
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,morning,0-20,Afghanistan,38928346.0,652860.0,60.0,Last session of the day,Last session day,Last session day
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,noon,21-30,Albania,2877797.0,27400.0,105.0,Shanghai is also really exciting (precisely -...,Shanghai also really exciting ( precisely -- s...,Shanghai also really exciting ( precisely -- s...
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,night,31-45,Algeria,43851044.0,2381740.0,18.0,"Recession hit Veronique Branquinho, she has to...","Recession hit Veronique Branquinho , quit comp...","Recession hit Veronique Branquinho , quit comp..."
3,01082688c6,happy bday!,positive,morning,46-60,Andorra,77265.0,470.0,164.0,happy bday!,happy bday !,happy bday !
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,noon,60-70,Angola,32866272.0,1246700.0,26.0,- I like it!!,- like ! !,- like ! !
...,...,...,...,...,...,...,...,...,...,...,...,...
3529,e5f0e6ef4b,"its at 3 am, im very tired but i can`t sleep ...",negative,noon,21-30,Nicaragua,6624554.0,120340.0,55.0,"its at 3 am, im very tired but i can`t sleep ...","3 , im tired ` sleep try",", im tired ` sleep try"
3530,416863ce47,All alone in this old house again. Thanks for...,positive,night,31-45,Niger,24206644.0,1266700.0,19.0,All alone in this old house again. Thanks for...,alone old house . Thanks net keeps alive kicki...,alone old house . Thanks net keeps alive kicki...
3531,6332da480c,I know what you mean. My little dog is sinkin...,negative,morning,46-60,Nigeria,206139589.0,910770.0,226.0,I know what you mean. My little dog is sinkin...,know mean . little dog sinking depression ... ...,know mean . little dog sinking depression ... ...
3532,df1baec676,_sutra what is your next youtube video gonna b...,positive,noon,60-70,North Korea,25778816.0,120410.0,214.0,_sutra what is your next youtube video gonna b...,_sutra next youtube video gon na ? love videos !,_sutra next youtube video gon na ? love videos !


In [12]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Create one Porter stemmer up front; stem_text reuses it on every call
porter = PorterStemmer()

def stem_text(text):
    """Tokenize ``text`` and return its Porter-stemmed tokens joined by single spaces."""
    return ' '.join(map(porter.stem, word_tokenize(text)))

# Stem the digit-free text and keep the result in its own 'stemmed_text' column
df['stemmed_text'] = df['clean_digits_text'].map(stem_text)


In [13]:
# Inspect the frame: it now carries the 'stemmed_text' column
df

Unnamed: 0,textID,text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²),clean_url_text,clean_stopwords_text,clean_digits_text,stemmed_text
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,morning,0-20,Afghanistan,38928346.0,652860.0,60.0,Last session of the day,Last session day,Last session day,last session day
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,noon,21-30,Albania,2877797.0,27400.0,105.0,Shanghai is also really exciting (precisely -...,Shanghai also really exciting ( precisely -- s...,Shanghai also really exciting ( precisely -- s...,shanghai also realli excit ( precis -- skyscra...
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,night,31-45,Algeria,43851044.0,2381740.0,18.0,"Recession hit Veronique Branquinho, she has to...","Recession hit Veronique Branquinho , quit comp...","Recession hit Veronique Branquinho , quit comp...","recess hit veroniqu branquinho , quit compani ..."
3,01082688c6,happy bday!,positive,morning,46-60,Andorra,77265.0,470.0,164.0,happy bday!,happy bday !,happy bday !,happi bday !
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,noon,60-70,Angola,32866272.0,1246700.0,26.0,- I like it!!,- like ! !,- like ! !,- like ! !
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3529,e5f0e6ef4b,"its at 3 am, im very tired but i can`t sleep ...",negative,noon,21-30,Nicaragua,6624554.0,120340.0,55.0,"its at 3 am, im very tired but i can`t sleep ...","3 , im tired ` sleep try",", im tired ` sleep try",", im tire ` sleep tri"
3530,416863ce47,All alone in this old house again. Thanks for...,positive,night,31-45,Niger,24206644.0,1266700.0,19.0,All alone in this old house again. Thanks for...,alone old house . Thanks net keeps alive kicki...,alone old house . Thanks net keeps alive kicki...,alon old hous . thank net keep aliv kick ! who...
3531,6332da480c,I know what you mean. My little dog is sinkin...,negative,morning,46-60,Nigeria,206139589.0,910770.0,226.0,I know what you mean. My little dog is sinkin...,know mean . little dog sinking depression ... ...,know mean . little dog sinking depression ... ...,know mean . littl dog sink depress ... want mo...
3532,df1baec676,_sutra what is your next youtube video gonna b...,positive,noon,60-70,North Korea,25778816.0,120410.0,214.0,_sutra what is your next youtube video gonna b...,_sutra next youtube video gon na ? love videos !,_sutra next youtube video gon na ? love videos !,_sutra next youtub video gon na ? love video !


In [22]:
import spacy

# Install the small English pipeline only when it is missing, so re-running
# this cell does not download and reinstall the package every time
if not spacy.util.is_package("en_core_web_sm"):
    spacy.cli.download("en_core_web_sm")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [26]:
# Install the medium English pipeline only when it is missing, so re-running
# this cell does not download and reinstall the package every time.
# NOTE(review): the cells visible here only load 'en_core_web_sm' -- confirm
# this model is needed at all.
if not spacy.util.is_package("en_core_web_md"):
    spacy.cli.download("en_core_web_md")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [29]:
import spacy

# Load spaCy's small English pipeline once; lemmatize_text reuses it on every call
nlp = spacy.load('en_core_web_sm')

def lemmatize_text(text):
    """Run ``text`` through the spaCy pipeline and return the lemma of every
    token, joined by single spaces."""
    return ' '.join(tok.lemma_ for tok in nlp(text))

# Lemmatize the digit-free text (not the stemmed text) and keep the result in
# its own 'lemmatized_text' column
df['lemmatized_text'] = df['clean_digits_text'].map(lemmatize_text)


In [31]:
# Inspect the frame: it now carries the 'lemmatized_text' column
df

Unnamed: 0,textID,text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²),clean_url_text,clean_stopwords_text,clean_digits_text,stemmed_text,lemmatized_text
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,morning,0-20,Afghanistan,38928346.0,652860.0,60.0,Last session of the day,Last session day,Last session day,last session day,last session day
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,noon,21-30,Albania,2877797.0,27400.0,105.0,Shanghai is also really exciting (precisely -...,Shanghai also really exciting ( precisely -- s...,Shanghai also really exciting ( precisely -- s...,shanghai also realli excit ( precis -- skyscra...,Shanghai also really exciting ( precisely -- s...
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,night,31-45,Algeria,43851044.0,2381740.0,18.0,"Recession hit Veronique Branquinho, she has to...","Recession hit Veronique Branquinho , quit comp...","Recession hit Veronique Branquinho , quit comp...","recess hit veroniqu branquinho , quit compani ...","recession hit Veronique Branquinho , quit comp..."
3,01082688c6,happy bday!,positive,morning,46-60,Andorra,77265.0,470.0,164.0,happy bday!,happy bday !,happy bday !,happi bday !,happy bday !
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,noon,60-70,Angola,32866272.0,1246700.0,26.0,- I like it!!,- like ! !,- like ! !,- like ! !,- like ! !
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3529,e5f0e6ef4b,"its at 3 am, im very tired but i can`t sleep ...",negative,noon,21-30,Nicaragua,6624554.0,120340.0,55.0,"its at 3 am, im very tired but i can`t sleep ...","3 , im tired ` sleep try",", im tired ` sleep try",", im tire ` sleep tri",", I m tired ` sleep try"
3530,416863ce47,All alone in this old house again. Thanks for...,positive,night,31-45,Niger,24206644.0,1266700.0,19.0,All alone in this old house again. Thanks for...,alone old house . Thanks net keeps alive kicki...,alone old house . Thanks net keeps alive kicki...,alon old hous . thank net keep aliv kick ! who...,alone old house . thank net keep alive kicking...
3531,6332da480c,I know what you mean. My little dog is sinkin...,negative,morning,46-60,Nigeria,206139589.0,910770.0,226.0,I know what you mean. My little dog is sinkin...,know mean . little dog sinking depression ... ...,know mean . little dog sinking depression ... ...,know mean . littl dog sink depress ... want mo...,know mean . little dog sink depression ... wan...
3532,df1baec676,_sutra what is your next youtube video gonna b...,positive,noon,60-70,North Korea,25778816.0,120410.0,214.0,_sutra what is your next youtube video gonna b...,_sutra next youtube video gon na ? love videos !,_sutra next youtube video gon na ? love videos !,_sutra next youtub video gon na ? love video !,_ sutra next youtube video go to ? love video !


In [37]:
import json

# Load the JSON file containing the contraction dictionary.
# The encoding is given explicitly: JSON is normally UTF-8, while open() would
# otherwise fall back to the platform's locale encoding.
with open('contractions.json', 'r', encoding='utf-8') as f:
    contraction_dict = json.load(f)

# Function to expand contractions using the loaded contraction dictionary
def expand_contractions(text, contractions=None):
    """Replace every contraction in ``text`` with its expansion.

    All keys are matched in a single pass, longest key first, and only where
    the key is not glued to a neighbouring word character. Compared with
    chained ``str.replace`` calls this means that:

    * a long key such as ``"can't've"`` is expanded as a whole instead of
      having its ``"can't"`` prefix rewritten first;
    * a key is not expanded inside a larger word;
    * text produced by one expansion is never rewritten by another key.

    Matching is case-sensitive.

    Parameters
    ----------
    text : str
        The text to process. Non-string values are returned unchanged.
    contractions : dict, optional
        Mapping of contraction -> expansion. An expansion may be a string or
        a list of strings; a list is joined with single spaces. Defaults to
        the module-level ``contraction_dict``.

    Returns
    -------
    str
        The text with contractions expanded.
    """
    import re  # local import: this cell itself only imports json

    mapping = contraction_dict if contractions is None else contractions
    if not isinstance(text, str) or not mapping:
        return text

    # Longest keys first so the alternation prefers "can't've" over "can't"
    keys = sorted(mapping, key=len, reverse=True)
    pattern = re.compile(
        r"(?<!\w)(?:" + "|".join(re.escape(key) for key in keys) + r")(?!\w)"
    )

    def _expansion_for(match):
        expansion = mapping[match.group(0)]
        if isinstance(expansion, list):
            expansion = ' '.join(expansion)
        return expansion

    return pattern.sub(_expansion_for, text)

# Apply the expand_contractions function to the lemmatized text and keep the
# result in its own 'expanded_text' column.
# NOTE(review): by this stage tokenization and stop-word removal have already
# split contractions apart (e.g. "that`s great" has become "` great"), and the
# tweets use a backtick where an apostrophe is expected, so this step may find
# little to expand -- consider expanding contractions on 'clean_url_text'
# before stop-word removal; confirm against the keys in contractions.json.
df['expanded_text'] = df['lemmatized_text'].apply(expand_contractions)

In [40]:
# Show the first 15 rows, including the new 'expanded_text' column
df.head(15)

Unnamed: 0,textID,text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²),clean_url_text,clean_stopwords_text,clean_digits_text,stemmed_text,lemmatized_text,expanded_text
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,morning,0-20,Afghanistan,38928346.0,652860.0,60.0,Last session of the day,Last session day,Last session day,last session day,last session day,last session day
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,noon,21-30,Albania,2877797.0,27400.0,105.0,Shanghai is also really exciting (precisely -...,Shanghai also really exciting ( precisely -- s...,Shanghai also really exciting ( precisely -- s...,shanghai also realli excit ( precis -- skyscra...,Shanghai also really exciting ( precisely -- s...,Shanghai also really exciting ( precisely -- s...
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,night,31-45,Algeria,43851044.0,2381740.0,18.0,"Recession hit Veronique Branquinho, she has to...","Recession hit Veronique Branquinho , quit comp...","Recession hit Veronique Branquinho , quit comp...","recess hit veroniqu branquinho , quit compani ...","recession hit Veronique Branquinho , quit comp...","recession hit Veronique Branquinho , quit comp..."
3,01082688c6,happy bday!,positive,morning,46-60,Andorra,77265.0,470.0,164.0,happy bday!,happy bday !,happy bday !,happi bday !,happy bday !,happy bday !
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,noon,60-70,Angola,32866272.0,1246700.0,26.0,- I like it!!,- like ! !,- like ! !,- like ! !,- like ! !,- like ! !
5,726e501993,that`s great!! weee!! visitors!,positive,night,70-100,Antigua and Barbuda,97929.0,440.0,223.0,that`s great!! weee!! visitors!,` great ! ! weee ! ! visitors !,` great ! ! weee ! ! visitors !,` great ! ! weee ! ! visitor !,` great ! ! weee ! ! visitor !,` great ! ! weee ! ! visitor !
6,261932614e,I THINK EVERYONE HATES ME ON HERE lol,negative,morning,0-20,Argentina,45195774.0,2736690.0,17.0,I THINK EVERYONE HATES ME ON HERE lol,THINK EVERYONE HATES lol,THINK EVERYONE HATES lol,think everyon hate lol,think everyone hate lol,think everyone hate lol
7,afa11da83f,"soooooo wish i could, but im in school and my...",negative,noon,21-30,Armenia,2963243.0,28470.0,104.0,"soooooo wish i could, but im in school and my...","soooooo wish could , im school myspace complet...","soooooo wish could , im school myspace complet...","soooooo wish could , im school myspac complet ...","soooooo wish could , I m school myspace comple...","soooooo wish could , I m school myspace comple..."
8,e64208b4ef,and within a short time of the last clue all ...,neutral,night,31-45,Australia,25499884.0,7682300.0,3.0,and within a short time of the last clue all ...,within short time last clue,within short time last clue,within short time last clue,within short time last clue,within short time last clue
9,37bcad24ca,What did you get? My day is alright.. haven`...,neutral,morning,46-60,Austria,9006398.0,82400.0,109.0,What did you get? My day is alright.. haven`...,get ? day alright .. ` done anything yet . lea...,get ? day alright .. ` done anything yet . lea...,get ? day alright .. ` done anyth yet . leav s...,get ? day alright .. ` do anything yet . leave...,get ? day alright .. ` do anything yet . leave...
