In [1]:
import numpy as np
import pandas as pd
import nltk
import re
from  nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.model_selection import train_test_split
from  sklearn.metrics import accuracy_score
import random
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from collections import defaultdict

In [2]:
twitter_data = pd.read_csv('twitter_data_stemmed.csv')
twitter_data['stemmed_content'].replace({np.nan: ''}, inplace=True)
nan_counts = twitter_data.isna().sum()
print(nan_counts)

target             0
id                 0
date               0
flag               0
user               0
text               0
stemmed_content    0
dtype: int64


In [3]:
twit = twitter_data[["stemmed_content", "target"]]
twit.count()

stemmed_content    1600000
target             1600000
dtype: int64

In [4]:
twit.sample(frac=1).reset_index(drop=True)

Unnamed: 0,stemmed_content,target
0,head hurt,0
1,meet ellen lunch,1
2,mtv movi award tonit hp sneak peek fucck chyeahh,1
3,chantiparnel awww bless,0
4,welshstev thank bro think late night dubai cha...,1
...,...,...
1599995,batteri die x fairrrrrrr wanna play x,0
1599996,followfriday nickcharney give faith hope hell ...,1
1599997,david henri http twitpic com fumn awwwww preci...,1
1599998,man arm hurt like hell get vaccin start colleg,0


In [5]:
positive_array = twit[twit["target"] == 1].reset_index(drop=True)
negative_array = twit[twit["target"] == 0].reset_index(drop=True)
print(positive_array.shape)
print(negative_array.shape)
#  Concatenate the two subsets to create the final DataFrame with equal counts of both classes
new_df = pd.concat([positive_array, negative_array])

# Shuffle the rows to randomize the order
new_df = new_df.sample(frac=1).reset_index(drop=True)
new_df.count()

(800000, 2)
(800000, 2)


stemmed_content    1600000
target             1600000
dtype: int64

In [18]:
from sklearn.model_selection import train_test_split

# Separate positive and negative samples
positive_samples = new_df[new_df['target'] == 1]
negative_samples = new_df[new_df['target'] == 0]

# Split positive samples into training and testing sets
positive_train, positive_test = train_test_split(positive_samples, test_size=0.2, random_state=42)

# Split negative samples into training and testing sets
negative_train, negative_test = train_test_split(negative_samples, test_size=0.2, random_state=42)

# Combine positive and negative training sets to create balanced training set
balanced_train = pd.concat([positive_train, negative_train])

# Combine positive and negative testing sets to create balanced testing set
balanced_test = pd.concat([positive_test, negative_test])

# Shuffle the rows to randomize the order
balanced_train = balanced_train.sample(frac=1).reset_index(drop=True)
balanced_test = balanced_test.sample(frac=1).reset_index(drop=True)

# Print the shapes of the balanced training and testing sets
print("Balanced training set shape:", balanced_train.shape)
print("Balanced testing set shape:", balanced_test.shape)


Balanced training set shape: (1280000, 2)
Balanced testing set shape: (320000, 2)


In [33]:
print(balanced_test)

                                          stemmed_content  target
0                            soozenw methink poo come way       1
1                                destinysport anyth angel       1
2                            realli want make pooram year       0
3       love brother birthday sang ridicul song voicem...       1
4       secretbeav may heal comfort quick possibl may ...       1
...                                                   ...     ...
319995  hey teamdemi lovato awww got go say bye hehe t...       0
319996                                   massiv morn stuf       1
319997                                    think conan hot       1
319998                 go work black drunk time juci hate       0
319999                             kamsmommi naw nevr got       0

[320000 rows x 2 columns]


In [6]:
# Separate positive and negative tweets
positive_tweets = new_df[new_df['target'] == 1]
negative_tweets = new_df[new_df['target'] == 0]

# Function to generate and plot word clouds
def generate_word_cloud(data, sentiment):
    all_text = ' '.join(data['stemmed_content'])
    
    # Generate word cloud
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate(all_text)

    # Plot word cloud
    plt.figure(figsize=(10, 5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(f'Word Cloud for {sentiment} Tweets')
    plt.axis('off')
    plt.show()


In [19]:
def create_frequency_df(new_df):
    # Initialize frequency dictionaries
    positive_freq_dict = defaultdict(int)
    negative_freq_dict = defaultdict(int)

    # Iterate over each row in the DataFrame
    for index, row in new_df.iterrows():
        # Tokenize the stemmed content into individual words
        words = row['stemmed_content'].split()
        # Update frequency dictionaries based on target class
        for word in words:
            if row['target'] == 1:
                positive_freq_dict[word] += 1
            else:
                negative_freq_dict[word] += 1

    # Combine positive and negative frequency dictionaries to get unique words
    unique_words = set(positive_freq_dict.keys()).union(negative_freq_dict.keys())

    # Create a DataFrame from frequency dictionaries
    frequency_df = pd.DataFrame({
        'positive_freq': [positive_freq_dict[word] for word in unique_words],
        'neg_freq': [negative_freq_dict[word] for word in unique_words]
    },index = list(unique_words))
    frequency_df['total_freq'] = frequency_df['positive_freq'] + frequency_df['neg_freq']
    return frequency_df



# Create the frequency DataFrame
frequency_df = create_frequency_df(balanced_train)

# Display the frequency DataFrame
print(frequency_df)


               positive_freq  neg_freq  total_freq
digita                     0         1           1
smileychub                 1         0           1
tualp                      0         1           1
andrevr                    7         8          15
prodhack                   1         0           1
...                      ...       ...         ...
sshayler                   1         0           1
pisd                       0         4           4
extant                     1         0           1
lirraangelica              0         1           1
ahj                       10        17          27

[461054 rows x 3 columns]


In [20]:
total_positive_sum = frequency_df['positive_freq'].sum()
total_negative_sum = frequency_df['neg_freq'].sum()

v = len(frequency_df)

In [21]:
frequency_df['w|pos'] = (frequency_df['positive_freq'] + 1)/(total_positive_sum + v)
frequency_df['w|neg'] = (frequency_df['neg_freq'] + 1)/(total_negative_sum + v)

frequency_df['lamda'] = np.log(frequency_df['w|pos']/frequency_df['w|neg'])

In [22]:
print(frequency_df['w|pos'].sum())
print(frequency_df['w|neg'].sum())

1.0
0.9999999999999999


In [23]:
print(frequency_df.head())

            positive_freq  neg_freq  total_freq         w|pos         w|neg  \
digita                  0         1           1  1.842891e-07  3.711613e-07   
smileychub              1         0           1  3.685782e-07  1.855806e-07   
tualp                   0         1           1  1.842891e-07  3.711613e-07   
andrevr                 7         8          15  1.474313e-06  1.670226e-06   
prodhack                1         0           1  3.685782e-07  1.855806e-07   

               lamda  
digita     -0.700131  
smileychub  0.686163  
tualp      -0.700131  
andrevr    -0.124767  
prodhack    0.686163  


In [38]:
import pandas as pd

# Function to calculate accuracy of a prediction function on testing data
def calculate_accuracy(predict_function, X_test, y_test):
    correct_predictions = 0
    total_samples = len(X_test)
    
    # Iterate over each sample in the testing data
    for i in range(total_samples):
        # Use the predict_function to predict sentiment for the current sample
        predicted_label = predict_function(X_test[i])
        
        # Compare the predicted label with the true label
        if predicted_label == y_test[i]:
            correct_predictions += 1
    
    # Calculate accuracy
    accuracy = correct_predictions / total_samples
    return accuracy

# Assuming you have your predict_sentiment function and testing data ready
def predict_sentiment(text):
    log_likelihood = 0
    log_prior = 0
    
    for word in text.split():
        try:
            lamda_value = frequency_df.loc[word, 'lamda']
        except KeyError:
            continue  # Skip this word if it's not found in frequency_df
        else:
            if not pd.isna(lamda_value):
                log_likelihood += lamda_value
    
    # Add log prior to the log-likelihood score
    log_score = log_likelihood + log_prior
    
    # Predict sentiment based on log score
    if log_score > 0:
        return 1
    elif log_score < 0:
        return 0
    else:
        return -1

# Calculate accuracy on testing set
accuracy = calculate_accuracy(predict_sentiment, balanced_test['stemmed_content'], balanced_test['target']) 
print("Accuracy on testing set:", accuracy*100)


Accuracy on testing set: 76.185625


In [24]:
port_stem = PorterStemmer()

def stemming (content):
    stemmed_content = re.sub('[^a-zA-Z]', ' ', content)  # remove non alphabetic characters and replace them with
    stemmed_content = stemmed_content.lower()
    stemmed_content = stemmed_content.split()
    stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]
    stemmed_content = ' '.join(stemmed_content)
    
    return stemmed_content

In [25]:
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import numpy as np

# Preprocessing function
def preprocess_text(text):
    port_stem = PorterStemmer()
    stop_words = set(stopwords.words('english'))

    # Remove non-alphabetic characters and convert to lowercase
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = text.lower()

    # Tokenize and remove stopwords
    words = text.split()
    words = [port_stem.stem(word) for word in words if word not in stop_words]

    print(words)
    # Join the words back into a string
    processed_text = ' '.join(words)

    return words


In [30]:
text = "@smarrison i would've been the first, but i didn't have a gun.    not really though, zac snyder's just a doucheclown"
words_sample_test = preprocess_text(text)

['smarrison', 'would', 'first', 'gun', 'realli', 'though', 'zac', 'snyder', 'doucheclown']


In [31]:
loglikelihood =0
for i in words_sample_test:
    if pd.isna(frequency_df.loc[i, 'lamda']) :
        pass 
    else:
        loglikelihood += frequency_df.loc[i, 'lamda']

print(loglikelihood)           


-1.2943784419326936


In [28]:
def predict_sentiment(text):
    # Compute the log-likelihood score for the tweet
    words_sample_test = preprocess_text(text)
    log_likelihood =0
    log_prior =0
    for i in words_sample_test:
        if pd.isna(frequency_df.loc[i, 'lamda']) :
            pass 
        else:
            log_likelihood += frequency_df.loc[i, 'lamda']
    # Add log prior to the log-likelihood score
    log_score = log_likelihood + log_prior
    
    # Predict sentiment based on log score
    if log_score > 0:
        return 1
    elif log_score < 0:
        return 0
    else:
        return -1





In [29]:
text = input("Enter you statement for sentiment analysis")
predict_sentiment(text)

['ram', 'go', 'mumbai']


0