diff --git a/src/config/data.py b/src/config/data.py index 5a0f5ce..ab91ece 100644 --- a/src/config/data.py +++ b/src/config/data.py @@ -1,107 +1,175 @@ commit_types = { - "feat": ["add", "implement", "new", "introduce"], - "fix": ["fix", "resolve", "patch", "address"], - "docs": ["document", "update docs", "readme"], - "refactor": ["refactor", "restructure", "simplify"], - "test": ["test", "coverage", "spec"], - "style": ["format", "style", "lint"], - "perf": ["optimize", "performance", "speed"], - "chore": ["chore", "maintain", "update", "remove"], - "ci": ["ci", "pipeline", "workflow"], - "build": ["build", "dependency", "version"], + "feat": [ + "add", "implement", "introduce", "enable", "support", "create", "integrate", + "launch", "expand", "extend", "enhance", "develop", "prototype", "establish", "apply" + ], + "fix": [ + "fix", "resolve", "patch", "address", "correct", "repair", "handle", "debug", + "mitigate", "eliminate", "hotfix", "prevent", "recover", "rollback", "restore", "sanitize" + ], + "docs": [ + "document", "update", "revise", "clarify", "explain", "annotate", "expand", + "rewrite", "summarize", "edit", "describe", "correct", "detail", "review", "comment" + ], + "refactor": [ + "refactor", "restructure", "simplify", "cleanup", "redesign", "revamp", "reorganize", + "rework", "modularize", "decompose", "deduplicate", "optimize", "streamline", "improve" + ], + "test": [ + "test", "cover", "validate", "verify", "mock", "benchmark", "assert", + "simulate", "automate", "debug", "execute", "configure tests", "extend tests", "fix tests" + ], + "style": [ + "format", "style", "lint", "reformat", "standardize", "align", "adjust", + "fix formatting", "apply conventions", "refine", "clean up", "improve consistency", "update style" + ], + "perf": [ + "optimize", "improve", "accelerate", "reduce", "enhance", "boost", "scale", + "compress", "minimize", "streamline", "increase efficiency", "refine performance", "cache" + ], + "chore": [ + "maintain", "update", 
"remove", "tidy", "clean up", "upgrade", "refine", + "adjust", "improve", "synchronize", "configure", "manage", "enhance workflow", "fix config" + ], + "ci": [ + "configure", "update", "fix", "automate", "optimize", "improve", "adjust", + "modify", "setup", "enhance", "stabilize", "resolve", "debug pipeline", "fix workflow" + ], + "build": [ + "build", "upgrade", "install", "configure", "compile", "generate", "package", + "bundle", "setup", "update dependencies", "refactor build", "restructure build", "resolve build issue" + ], + "security": [ + "secure", "harden", "encrypt", "sanitize", "patch", "fix", "prevent", + "validate", "enforce", "strengthen", "protect", "lockdown", "restrict", "authenticate" + ], + "revert": [ + "revert", "undo", "rollback", "restore", "reset", "remove", + "uncommit", "reapply", "discard", "unmerge", "restore previous", "reverse" + ], + "deps": [ + "bump", "update", "upgrade", "downgrade", "synchronize", "install", + "remove", "fix dependency", "resolve conflict", "refresh", "pin version", "patch dependencies" + ], + "wip": [ + "draft", "prototype", "experiment", "iterate", "develop", "explore" + ], + "release": [ + "release", "deploy", "publish", "tag", "version", "finalize", + "ship", "prepare", "announce", "mark as stable" + ], + "i18n": [ + "translate", "localize", "adapt", "internationalize", "support", + "convert", "implement i18n", "update locales", "fix translation", "configure language settings" + ], + "a11y": [ + "improve", "enhance", "adjust", "enable", "optimize", "refine" + ], + "logging": [ + "log", "record", "track", "monitor", "report", "trace" + ], + "infra": [ + "deploy", "provision", "configure", "scale", "automate", "manage" + ] } example_commits = { - "feat": "feat(auth): implement OAuth2 with role-based access\n\nImplemented OAuth2 protocol with role-based control to enhance security and scalability.", - "fix": "fix(api): resolve data race in concurrent requests\n\nFixed a race condition by adding synchronization 
mechanisms to prevent concurrent data modifications.", - "docs": "docs(api): update authentication documentation\n\nUpdated API documentation to detail the new authentication methods and error handling procedures.", - "refactor": "refactor(core): simplify error handling logic\n\nRefactored error handling to remove redundancies and improve code maintainability.", - "chore": "chore(deps): update dependency versions to latest\n\nUpgraded dependencies to address security vulnerabilities and improve performance.", - "style": "style(components): format according to style guide\n\nReformatted code to comply with style guidelines for better readability and consistency.", - "perf": "perf(queries): optimize database index for faster lookups\n\nEnhanced database indexing strategy to improve query performance on large datasets.", - "test": "test(api): add integration tests for payment flow\n\nAdded integration tests to ensure reliable and consistent performance of the payment processing system." + "feat": "feat(auth): add OAuth2 with roles\n\nImplemented OAuth2 authentication with role-based access control.", + "fix": "fix(api): resolve data race issue\n\nFixed concurrency bug by synchronizing access to shared resources.", + "docs": "docs(api): expand auth docs\n\nUpdated API documentation with authentication flow and error handling details.", + "refactor": "refactor(core): restructure errors\n\nRemoved redundant code and improved error-handling maintainability.", + "chore": "chore(deps): upgrade dependencies\n\nUpdated dependency versions to fix vulnerabilities and enhance security.", + "style": "style(ui): refine button alignment\n\nImproved button positioning for consistent UI layout.", + "perf": "perf(db): optimize indexing\n\nRefined database indexes to enhance query performance and speed.", + "test": "test(auth): add OAuth2 tests\n\nImplemented tests to verify OAuth authentication and token management.", + "build": "build(ci): add Dockerfile\n\nCreated a Dockerfile for 
consistent application containerization.", + "ci": "ci(lint): enforce code style\n\nAdded ESLint rules to CI workflow to maintain consistent code quality.", + "revert": "revert(ui): undo theme changes\n\nReverted recent UI theme changes due to accessibility issues.", + "security": "security(auth): patch token leakage\n\nFixed issue where expired tokens could be reused under certain conditions.", + "i18n": "i18n(app): add Spanish translations\n\nIntegrated Spanish language support for multi-language accessibility.", + "deps": "deps(api): upgrade Django to 4.2\n\nUpdated Django version to latest stable release for security improvements.", + "wip": "wip(dashboard): redesign analytics page\n\nPartial implementation of analytics dashboard revamp.", + "release": "release(v1.2.0): prepare for deployment\n\nUpdated changelog and version number for the new release.", } commit_training_data = { "feat": [ - "feat(auth): implement JWT authentication flow\n\nImplemented JWT-based authentication with token expiration handling to secure user sessions.", - "feat(ui): add dark mode toggle with system preference detection\n\nAdded dark mode toggle that automatically adjusts based on system settings for improved user experience.", - "feat(api): implement rate limiting middleware\n\nIntroduced rate limiting to prevent API abuse and ensure system stability under high load.", - "feat(forms): add client-side form validation\n\nImplemented real-time form validation to provide immediate feedback and improve data integrity.", - "feat(search): implement elasticsearch integration\n\nIntegrated Elasticsearch to boost search performance and enhance result accuracy.", - "feat(cache): add Redis caching layer for API responses\n\nAdded a Redis caching layer to reduce response times and improve overall scalability.", - "feat(auth): implement social login providers\n\nEnabled social login functionality to simplify the authentication process for users.", - "feat(security): add two-factor 
authentication support\n\nIntroduced two-factor authentication to enhance account security and reduce fraud risks.", + "feat(api): add JWT authentication\n\nImplemented JWT-based authentication with token expiration and refresh.", + "feat(ui): support dark mode toggle\n\nAdded a dark mode switch with automatic detection of system preferences.", + "feat(api): enforce request throttling\n\nImplemented rate limiting to prevent abuse and ensure API stability.", + "feat(forms): enable live validation\n\nAdded real-time form validation for better user feedback and accuracy.", + "feat(search): integrate Elasticsearch\n\nBoosted search speed and accuracy by implementing Elasticsearch indexing.", + "feat(auth): add OAuth2 with role control\n\nImplemented OAuth2 authentication with role-based access restrictions.", + "feat(notifications): add email alerts\n\nIntroduced automated email notifications for user activity updates.", + "feat(logging): enhance API logs\n\nAdded structured logging for better error tracking and performance insights.", + "feat(db): support soft deletes\n\nImplemented soft deletion for records, preserving data while hiding it.", + "feat(files): support bulk uploads\n\nEnabled bulk file uploads with real-time progress tracking and validation.", ], "fix": [ - "fix(auth): resolve token refresh race condition\n\nFixed a race condition in the token refresh logic by implementing proper synchronization mechanisms.", - "fix(api): handle concurrent request deadlocks\n\nResolved API deadlocks by optimizing resource locking and request handling procedures.", - "fix(validation): correct email regex pattern\n\nUpdated the email validation regex to accurately handle various valid email formats.", - "fix(memory): resolve memory leak in WebSocket connections\n\nAddressed a memory leak by ensuring WebSocket connections are properly closed after use.", - "fix(security): patch SQL injection vulnerability\n\nPatched a SQL injection vulnerability by sanitizing user inputs 
and using parameterized queries.", - "fix(cors): resolve cross-origin request issues\n\nAdjusted CORS settings to correctly handle cross-origin requests and improve security.", - "fix(cache): handle cache invalidation edge cases\n\nFixed issues with cache invalidation to ensure data consistency across different layers.", - "fix(ui): resolve mobile viewport rendering issues\n\nCorrected viewport meta tag settings to improve rendering on mobile devices.", + "fix(auth): fix token refresh bug\n\nResolved issue where refresh tokens were not properly invalidated on logout.", + "fix(api): prevent request deadlocks\n\nOptimized transaction handling to eliminate API request deadlock issues.", + "fix(security): patch SQL injection\n\nSanitized database queries to mitigate SQL injection vulnerabilities.", + "fix(ui): fix button alignment issue\n\nCorrected button layout on mobile devices for better UI consistency.", + "fix(cache): prevent stale data reads\n\nFixed cache invalidation logic to ensure fresh data is always served.", + "fix(db): fix unique constraint errors\n\nResolved integrity errors caused by duplicate entries in database tables.", + "fix(notifications): ensure email delivery\n\nFixed issue where email notifications were not being sent reliably.", + "fix(websockets): fix real-time sync issue\n\nResolved WebSocket desync problem causing delayed chat messages.", + "fix(auth): prevent session hijacking\n\nStrengthened session management to prevent unauthorized access attempts.", + "fix(middleware): catch unexpected errors\n\nImproved error handling to prevent crashes from unhandled exceptions.", ], "docs": [ - "docs(api): update REST endpoints documentation\n\nRevised REST API documentation to include detailed information on new endpoints and error handling.", - "docs(setup): improve installation instructions\n\nEnhanced installation guide with step-by-step instructions and troubleshooting tips for new users.", - "docs(auth): document OAuth2 implementation 
details\n\nProvided comprehensive documentation covering OAuth2 flows, configuration, and security considerations.", - "docs(deploy): add AWS deployment guide\n\nCreated a detailed guide for deploying the application on AWS, including best practices and configuration tips.", - "docs(contributing): update PR guidelines\n\nUpdated contributing guidelines to reflect new review processes and code standards.", - "docs(api): add GraphQL schema documentation\n\nIncluded detailed documentation for the GraphQL schema to help developers understand query structures.", - "docs(security): document security best practices\n\nOutlined security best practices and compliance requirements for developers and auditors.", - "docs(testing): update e2e testing guide\n\nRevised the end-to-end testing documentation with new scenarios and tool integrations.", + "docs(readme): refine setup guide\n\nImproved installation instructions and added common troubleshooting steps.", + "docs(api): document OAuth2 flow\n\nDetailed OAuth2 integration, token management, and permission scopes.", + "docs(contrib): update PR guidelines\n\nClarified contribution process, review steps, and merge requirements.", + "docs(env): document .env config\n\nAdded environment variable documentation for better deployment clarity.", + "docs(db): improve migration guide\n\nUpdated database migration instructions with rollback and recovery steps.", + "docs(security): clarify auth flows\n\nDetailed security best practices and authentication flow explanations.", + "docs(logging): add log format details\n\nProvided logging conventions to ensure consistent debugging insights.", + "docs(errors): document API error codes\n\nListed API error responses with descriptions for better client handling.", + "docs(deps): explain dependency versions\n\nClarified dependency requirements and upgrade policies in README.", ], "refactor": [ - "refactor(api): split monolithic controller into modules\n\nRefactored the API controller into modular 
components to enhance maintainability and scalability.", - "refactor(db): optimize database query patterns\n\nImproved database performance by optimizing complex queries and reducing unnecessary joins.", - "refactor(auth): separate authentication logic\n\nIsolated authentication logic into a dedicated module for clearer structure and easier testing.", - "refactor(middleware): improve error handling flow\n\nStreamlined error handling within middleware to ensure consistent responses across the application.", - "refactor(utils): create shared utility functions\n\nExtracted common code into shared utilities to reduce duplication and simplify maintenance.", - "refactor(services): implement repository pattern\n\nAdopted the repository pattern in the services layer to decouple business logic from data access.", - "refactor(validation): centralize validation logic\n\nCentralized various validation routines into a single module for consistency and reuse.", - "refactor(config): improve configuration management\n\nRefactored configuration handling by separating environment-specific settings into distinct files.", + "refactor(auth): modularize auth logic\n\nSeparated authentication logic into modules for better maintainability.", + "refactor(db): optimize query structure\n\nRewrote complex queries for better performance and readability.", + "refactor(middleware): unify error handling\n\nStandardized error responses across all middleware layers.", + "refactor(routes): simplify API paths\n\nReorganized API routes for clarity and consistency across endpoints.", + "refactor(config): centralize settings\n\nMoved configuration settings to a single file for better manageability.", + "refactor(models): improve data relations\n\nRefactored database models to optimize relationships and indexing.", + "refactor(files): clean up temp storage\n\nImproved temporary file management to avoid unnecessary disk usage.", + "refactor(logging): use structured logs\n\nRefactored logging format to 
include request details and traceability.", ], "chore": [ - "chore(deps): update package dependencies to latest\n\nUpgraded all package dependencies to their latest versions to address security issues and improve performance.", - "chore(ci): update GitHub Actions workflow\n\nRevised the CI pipeline to streamline automated testing and deployment processes.", - "chore(docker): optimize container build process\n\nOptimized the Dockerfile to reduce image build times and improve container efficiency.", - "chore(lint): update ESLint configuration\n\nUpdated ESLint rules to enforce new coding standards and remove deprecated configurations.", - "chore(git): update gitignore patterns\n\nRefined the .gitignore file to exclude unnecessary files and reduce repository clutter.", - "chore(deps): remove unused dependencies\n\nCleaned up the project by removing outdated and unused dependencies to simplify maintenance.", - "chore(scripts): update build scripts\n\nEnhanced build scripts for better readability and efficiency during the deployment process.", - "chore(types): update TypeScript definitions\n\nUpdated TypeScript definition files to reflect recent changes in the codebase.", + "chore(deps): upgrade all dependencies\n\nUpdated dependencies to latest stable versions for security and stability.", + "chore(ci): streamline deployment\n\nOptimized CI/CD workflow to reduce build times and deployment latency.", + "chore(env): standardize .env file\n\nUpdated environment variable handling for consistency across projects.", + "chore(build): optimize webpack config\n\nImproved webpack bundling settings to reduce output file size.", + "chore(ci): enforce linting in pipeline\n\nAdded lint checks to CI workflow to prevent style violations.", + "chore(release): bump version to 1.2.0\n\nUpdated version number and changelog for the latest release.", + "chore(deploy): automate staging updates\n\nAdded scripts to auto-deploy updates to the staging environment.", ], "style": [ - "style(css): 
align with design system guidelines\n\nUpdated CSS styles to conform with the latest design system standards for better consistency.", - "style(components): update button styling\n\nRefined button styling to improve visual hierarchy and overall usability in the UI.", - "style(layout): improve responsive grid system\n\nEnhanced the grid layout to ensure consistent behavior across multiple device sizes.", - "style(theme): update color palette variables\n\nModified theme variables to reflect new branding and improve the overall aesthetic appeal.", - "style(forms): standardize input field styling\n\nStandardized the styling of form inputs for a cohesive look throughout the application.", - "style(fonts): update typography system\n\nUpdated typography settings to enhance readability and maintain visual consistency.", - "style(animations): refine transition effects\n\nImproved transition effects for smoother animations and better user interaction.", - "style(icons): update icon system to SVG\n\nReplaced icon fonts with SVG icons to ensure scalability and clarity on all devices.", + "style(css): unify form styling\n\nApplied consistent padding and border styles to all form fields.", + "style(lint): enforce ESLint rules\n\nConfigured ESLint to maintain code consistency and best practices.", + "style(ui): improve navbar spacing\n\nAdjusted spacing for better visual balance across different screen sizes.", + "style(html): format markup\n\nReformatted HTML files to ensure proper indentation and readability.", + "style(js): remove unused variables\n\nCleaned up unused JavaScript variables to improve maintainability.", + "style(tailwind): apply consistent themes\n\nUnified Tailwind themes for better visual consistency across UI.", ], "perf": [ - "perf(images): implement lazy loading strategy\n\nImplemented lazy loading for images to defer off-screen loading and improve page load times.", - "perf(api): add query result caching\n\nIntroduced caching for API query results to 
reduce response times and lower server load.", - "perf(db): optimize database indices\n\nRevised database indices to accelerate query performance and reduce data retrieval latency.", - "perf(bundle): reduce JavaScript bundle size\n\nMinimized bundle size by removing unused code and optimizing dependency imports.", - "perf(assets): implement CDN distribution\n\nConfigured CDN distribution for static assets to boost load times and global accessibility.", - "perf(queries): optimize database join operations\n\nEnhanced join query efficiency to better handle large datasets and reduce processing time.", - "perf(cache): implement LRU caching strategy\n\nAdopted an LRU caching strategy to improve memory management and response speed.", - "perf(api): implement response compression\n\nEnabled compression for API responses to decrease payload size and improve transfer speeds.", + "perf(images): enable lazy loading\n\nImplemented lazy loading for images to reduce initial page load time.", + "perf(db): optimize index usage\n\nRefined database indexing strategy to accelerate query execution speeds.", + "perf(api): reduce payload size\n\nOptimized API responses by removing redundant data from JSON output.", + "perf(cache): improve eviction policy\n\nUpdated cache policy to ensure high-priority items remain available.", + "perf(css): minimize CSS bundle size\n\nReduced CSS file size by eliminating unused styles and improving compression.", + "perf(worker): parallelize background jobs\n\nOptimized worker processes for faster job execution and resource usage.", ], "test": [ - "test(api): add integration tests for auth flow\n\nAdded comprehensive integration tests to validate the authentication flow under various scenarios.", - "test(ui): add unit tests for form validation\n\nImplemented unit tests to ensure that all form validations perform correctly and reliably.", - "test(e2e): add checkout flow tests\n\nDeveloped end-to-end tests to simulate the complete checkout process and 
identify any issues.", - "test(utils): improve test coverage for helpers\n\nEnhanced test coverage for utility functions to catch edge cases and improve overall stability.", - "test(auth): add OAuth callback tests\n\nAdded tests specifically for OAuth callback functionality to ensure proper third-party integration.", - "test(api): add load testing scenarios\n\nImplemented load testing to evaluate API performance under high traffic conditions.", - "test(security): add penetration testing suite\n\nIntroduced a penetration testing suite to identify and mitigate potential security vulnerabilities.", - "test(performance): add benchmark tests\n\nAdded benchmark tests to measure performance improvements and track regression over time.", + "test(auth): add OAuth2 flow tests\n\nAdded integration tests to verify OAuth2 authentication scenarios.", + "test(api): improve error case coverage\n\nExpanded test cases to cover edge conditions and failure scenarios.", + "test(forms): validate input constraints\n\nAdded form validation tests to prevent invalid user input submission.", + "test(cache): add expiration tests\n\nEnsured cached items expire correctly based on defined TTL settings.", + "test(logging): verify log outputs\n\nTested log messages to ensure they capture correct request details.", + "test(websockets): simulate real-time load\n\nStress-tested WebSocket connections under high concurrent usage.", ], } diff --git a/src/core/analyzer.py b/src/core/analyzer.py deleted file mode 100644 index 2469f23..0000000 --- a/src/core/analyzer.py +++ /dev/null @@ -1,276 +0,0 @@ -from sklearn.feature_extraction.text import TfidfVectorizer -from ..config.data import ( - commit_types, - example_commits, - commit_training_data, - semantic_patterns, - VALID_PAIRS, - LETTER_FREQUENCY -) -from fastapi import HTTPException, status -from sklearn.metrics.pairwise import cosine_similarity -from .models import CommitIssue -from datetime import datetime -import ast, string - - -class 
CommitAnalyzer: - """ - Analyzes Git commit messages using a combination of pattern matching, - machine learning, and semantic analysis to ensure commit quality and - provide improvement suggestions. - """ - def __init__(self, settings: list): - """Initializes the analyzer with custom settings and prepares the ML classifier.""" - self.settings = settings - self.vectorizer = TfidfVectorizer() - - self.commit_types = commit_types.copy() - self.example_commits = example_commits.copy() - self.commit_training_data = commit_training_data.copy() - self.semantic_patterns = semantic_patterns.copy() - - self.slack_url = None # Retrieved from settings - - try: - self._apply_data_settings() - except Exception as e: - raise HTTPException( - status_code=status.HTTP_400_BAD_REQUEST, - detail=f"Invalid settings data: {str(e)}", - ) - - self._prepare_ml_classifier() - - def _apply_data_settings(self): - """ - Updates analyzer configuration with custom settings provided through Telex. - Custom settings can override default commit types, examples, and training data. - """ - for setting in self.settings: - if setting["label"] == "commit_types": - self.commit_types.update(ast.literal_eval(setting["default"].replace("\n", "\\n"))) if setting["default"] else self.commit_types - if setting["label"] == "example_commits": - self.example_commits.update(ast.literal_eval(setting["default"].replace("\n", "\\n"))) if setting["default"] else self.example_commits - if setting["label"] == "training_data": - self.commit_training_data.update(ast.literal_eval(setting["default"].replace("\n", "\\n"))) if setting["default"] else self.commit_training_data - if setting["label"] == "slack_url": - self.slack_url = setting["default"] - - def _prepare_ml_classifier(self): - """ - Prepares the machine learning classifier for commit type prediction. - Transforms the training dataset into TF-IDF vectors for similarity-based - classification of commit messages. 
- """ - x_train = [] - y_train = [] - - for commit_type, messages in self.commit_training_data.items(): - x_train.extend(messages) - y_train.extend([commit_type] * len(messages)) - - self.vectorizer.fit(x_train) - self.x_train_vectorized = self.vectorizer.transform(x_train) - self.y_train = y_train - - def _check_format(self, message: str) -> list[CommitIssue]: - """Validates commit message format against conventional commit standards.""" - first_word = message.split("(")[0] if "(" in message else message.split(":")[0] - - if first_word.lower() not in self.commit_types: - likely_type = self._suggest_commit_type(message) - return [ - CommitIssue( - severity="high", - message="Invalid commit type", - suggestion=f"Use '{likely_type}' for this kind of change\n└─ Example:\n• ```{self.example_commits[likely_type]}```", - ) - ] - return [] - - def _suggest_commit_type(self, message: str) -> str: - """Suggests the most appropriate commit type using a three-stage analysis pipeline.""" - message = message.lower() - - type_scores = { - commit_type: sum(word in message for word in indicators) - for commit_type, indicators in self.commit_types.items() - } - - if any(score > 0 for score in type_scores.values()): - return max(type_scores.items(), key=lambda x: x[1])[0] - - message_vectorized = self.vectorizer.transform([message]) - similarities = cosine_similarity(message_vectorized, self.x_train_vectorized)[0] - - if max(similarities) > 0.3: # If we have a decent similarity match - most_similar_idx = similarities.argmax() - return self.y_train[most_similar_idx] - - semantic_patterns = self.semantic_patterns - semantic_scores = { - commit_type: sum( - 1 for pattern in patterns if any(word in message for word in pattern) - ) - for commit_type, patterns in semantic_patterns.items() - } - - if any(score > 0 for score in semantic_scores.values()): - return max(semantic_scores.items(), key=lambda x: x[1])[0] - - return "chore" - - def _check_gibberish(self, word: str) -> bool: - """ - 
Determines if a word is likely to be gibberish using multiple linguistic patterns. - - The function employs four distinct checks to identify gibberish: - 1. Vowel ratio: Words must maintain a minimum vowel-to-length ratio of 0.2 - 2. Consonant sequences: Flags sequences of more than 4 consecutive consonants - 3. Letter frequency: For words >= 4 chars, compares letter frequencies against English language norms - 4. Consonant pairs: Identifies invalid consonant combinations that rarely occur in English - - A word is considered gibberish if it fails two or more of these checks. - """ - VOWELS = set('aeiouyAEIOUY') - - word = word.lower().strip(string.punctuation) - if not word or len(word) < 2 or not word.isalpha(): - return False - - failed_checks = 0 - - vowel_count = sum(1 for c in word if c in VOWELS) - if vowel_count / len(word) < 0.2: - failed_checks += 1 - - consonant_sequence = 0 - for char in word: - if char not in VOWELS: - consonant_sequence += 1 - if consonant_sequence > 4: - failed_checks += 1 - break - else: - consonant_sequence = 0 - - if len(word) >= 4: - char_counts = {} - for char in word: - char_counts[char] = char_counts.get(char, 0) + 1 - - deviation = 0 - for char, count in char_counts.items(): - if char in LETTER_FREQUENCY: - expected = LETTER_FREQUENCY[char] / 100 - actual = count / len(word) - deviation += abs(expected - actual) - - if (deviation / len(char_counts)) > 0.5: - failed_checks += 1 - - invalid_pairs = 0 - for i in range(len(word) - 1): - pair = word[i:i+2] - if pair not in VALID_PAIRS and pair[0] not in VOWELS and pair[1] not in VOWELS: - invalid_pairs += 1 - if invalid_pairs > 1: - failed_checks += 1 - break - - return failed_checks >= 2 - - def _check_content_quality(self, message: str) -> list[CommitIssue]: - """ - Assesses the quality of the commit message content. - Checks if the message is too short or lacks sufficient detail. - Also checks the commit message for potential gibberish words. 
- """ - issues = [] - words = message.split() - word_count = len(words) - if word_count < 5: - issues.append(CommitIssue( - severity="high", - message="Commit message is too short", - suggestion="Try providing a brief summary that explains what change was made and why." - )) - elif word_count < 10: - issues.append(CommitIssue( - severity="medium", - message="Commit message might be too brief", - suggestion="Consider adding a bit more detail." - )) - - gibberish_words = [ - word.strip(string.punctuation) - for word in words - if self._check_gibberish(word) and word.strip(string.punctuation) - ] - if gibberish_words: - issues.append(CommitIssue( - severity="high", - message="Potential gibberish words detected in commit message", - suggestion=f"Review and correct the following words: {', '.join(gibberish_words)}" - )) - return issues - - def _check_context(self, message: str) -> list[CommitIssue]: - """ - Evaluates whether the commit message provides adequate context. - Checks for and suggests separation of the message into a subject and a detailed body if needed. - """ - issues = [] - if "\n\n" not in message: - issues.append(CommitIssue( - severity="medium", - message="Commit message may be missing detailed context", - suggestion="Consider splitting your commit message into a concise subject and a detailed body." 
- )) - return issues - - def analyze_commit(self, message: str) -> list[CommitIssue]: - """Analyzes a commit message and returns any quality issues found.""" - issues = [] - issues.extend([*self._check_format(message)]) - issues.extend([*self._check_content_quality(message)]) - issues.extend([*self._check_context(message)]) - return [issue for issue in issues if issue] - - def format_analysis(self, commit: dict, issues: list[CommitIssue]) -> str: - """Formats analysis results into a human-readable message for Slack.""" - icons = {"high": "šŸ”“", "medium": "🟔"} - - timestamp = datetime.fromisoformat(commit["timestamp"].replace("Z", "+00:00")) - formatted_time = timestamp.strftime("%-I:%M%p. %A, %B %-d, %Y.") - - author = commit["author"] - author_info = f"{author['name']} ({author.get('email')})" - - commit_details = ( - "šŸ“ *Commit Details*\n" - f"└─ Hash: `{commit['id'][:8]}`\n" - f"└─ Author: {author_info}\n" - f"└─ URL: <{commit['url']}|commit url>\n" - f"└─ Time: {formatted_time}\n" - f"└─ Message:\n" - f"• ```{commit['message']}```\n" - ) - - if issues: - issues_text = "\n".join( - f"{icons[issue.severity]} {issue.message}\n" - f" └─ {issue.suggestion.replace(chr(10), chr(10) + ' ')}" - for issue in sorted(issues, key=lambda x: x.severity) - ) - analysis_section = "\nšŸ” *Analysis Results*\n" f"{issues_text}\n" - - suggestions = ( - "\nšŸ’” Resources\n" - "└─ Conventional Commits: \n" - "└─ Commit Best Practices: \n" - "└─ Git Best Practices: " - ) - - return f"{commit_details}{analysis_section}{suggestions}" diff --git a/src/core/analyzer/__init__.py b/src/core/analyzer/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/core/analyzer/analyzer.py b/src/core/analyzer/analyzer.py new file mode 100644 index 0000000..e7e0919 --- /dev/null +++ b/src/core/analyzer/analyzer.py @@ -0,0 +1,102 @@ +from ...config.data import ( + commit_types, + example_commits, + commit_training_data +) +from fastapi import HTTPException, status +from ..models 
import CommitIssue
from .format_analyzer import FormatAnalyzer
from .quality_analyzer import QualityAnalyzer
from datetime import datetime
import ast


class CommitAnalyzer:
    """
    Analyzes Git commit messages using a combination of pattern matching,
    machine learning, and semantic analysis to ensure commit quality and
    provide improvement suggestions.
    """

    # Explicit severity ranking: sorting on the raw strings would order them
    # alphabetically ("high" < "low" < "medium"), which is not severity order.
    _SEVERITY_ORDER = {"high": 0, "medium": 1, "low": 2}

    def __init__(self, settings: list) -> None:
        """
        Initializes the analyzer with custom settings and prepares the data
        the per-message analyzers consume.

        Args:
            settings: list of Telex setting dicts, each carrying at least a
                "label" key and usually a "default" payload.

        Raises:
            HTTPException: 400 when a settings payload cannot be parsed.
        """
        self.settings = settings
        self.slack_url = None  # Populated from the "slack_url" setting, if present.
        # Copy the module-level defaults so per-instance overrides from the
        # settings payload never mutate the shared configuration dicts.
        # (The original aliased `commit_types` without .copy(), leaking
        # custom types across analyzer instances.)
        self.commit_types = commit_types.copy()
        self.example_commits = example_commits.copy()
        self.commit_training_data = commit_training_data.copy()

        try:
            self._apply_data_settings()
        except Exception as e:
            raise HTTPException(
                status_code=status.HTTP_400_BAD_REQUEST,
                detail=f"Invalid settings data: {str(e)}",
            )

    def _apply_data_settings(self) -> None:
        """
        Updates analyzer configuration with custom settings provided through Telex.
        Custom settings can override default commit types, examples, and training data.
        Provides slack webhook url.
        """
        for setting in self.settings:
            label = setting["label"]
            default = setting.get("default")
            if label == "slack_url":
                self.slack_url = default
                continue
            if not default:
                continue  # Empty override: keep the built-in defaults.
            # Settings arrive as Python-literal strings; escape embedded raw
            # newlines so multi-line examples survive ast.literal_eval.
            parsed = ast.literal_eval(default.replace("\n", "\\n"))
            if label == "commit_types":
                self.commit_types.update(parsed)
            elif label == "example_commits":
                self.example_commits.update(parsed)
            elif label == "training_data":
                self.commit_training_data.update(parsed)

    def _check_content_format(self, message: str) -> list[CommitIssue]:
        """Runs the conventional-commit format checks on the message."""
        format_analyzer = FormatAnalyzer(message, self.commit_types, self.example_commits)
        return format_analyzer.check_all()

    def _check_content_quality(self, message: str) -> list[CommitIssue]:
        """Runs the content-quality (gibberish) checks on the message."""
        quality_analyzer = QualityAnalyzer(message)
        return quality_analyzer.check_all()

    def analyze_commit(self, message: str) -> list[CommitIssue]:
        """Analyzes a commit message and returns any quality issues found."""
        issues: list[CommitIssue] = []
        issues.extend(self._check_content_format(message))
        issues.extend(self._check_content_quality(message))
        return [issue for issue in issues if issue]

    def format_analysis(self, commit: dict, issues: list[CommitIssue]) -> str:
        """
        Formats analysis results into a human-readable message for Slack.

        Always returns a string; when `issues` is empty only the commit
        details are returned (the original implicitly returned None, which
        crashed callers that posted the result).
        """
        icons = {"high": "šŸ”“", "medium": "🟔", "low": "šŸ”µ"}

        timestamp = datetime.fromisoformat(commit["timestamp"].replace("Z", "+00:00"))
        # NOTE(review): "%-I"/"%-d" are glibc-only strftime extensions and
        # raise ValueError on Windows -- confirm the deployment target.
        formatted_time = timestamp.strftime("%-I:%M%p. %A, %B %-d, %Y.")

        author = commit["author"]
        author_info = f"{author['name']} ({author.get('email')})"

        commit_details = (
            "šŸ“ *Commit Details*\n"
            f"└─ Hash: `{commit['id'][:8]}`\n"
            f"└─ Author: {author_info}\n"
            f"└─ URL: <{commit['url']}|commit url>\n"
            f"└─ Time: {formatted_time}\n"
            f"└─ Message:\n"
            f"• ```{commit['message']}```\n"
        )

        if not issues:
            return commit_details

        issues_text = "\n".join(
            f"{icons[issue.severity]} {issue.message}\n"
            f"   └─ {issue.suggestion.replace(chr(10), chr(10) + '      ')}"
            for issue in sorted(
                issues,
                key=lambda i: self._SEVERITY_ORDER.get(i.severity, len(self._SEVERITY_ORDER)),
            )
        )
        analysis_section = "\nšŸ” *Analysis Results*\n" f"{issues_text}\n"

        suggestions = (
            "\nšŸ’” Resources\n"
            "└─ Conventional Commits: \n"
            "└─ Commit Best Practices: \n"
            "└─ Git Best Practices: "
        )

        return f"{commit_details}{analysis_section}{suggestions}"
from sklearn.feature_extraction.text import TfidfVectorizer
import string
from ..models import CommitIssue
from ...config.data import semantic_patterns, commit_training_data
from sklearn.metrics.pairwise import cosine_similarity


class FormatAnalyzer:
    """
    Validates commit message format against conventional commit standards.
    Checks for capitalization, punctuation, length.
    Provides improvement suggestions, including the most likely commit type
    when the message uses an unknown one.
    """

    def __init__(
        self,
        message: str,
        commit_types: dict[str, list[str]],
        example_commits: dict[str, str],
        training_data: dict[str, list[str]] | None = None,
    ) -> None:
        """
        Args:
            message: full commit message (subject plus optional body).
            commit_types: mapping of valid commit type -> indicator words.
            example_commits: mapping of commit type -> example commit message.
            training_data: optional mapping of commit type -> sample messages
                for the TF-IDF fallback; defaults to the bundled training set.
        """
        self.message = message
        # Prefer the conventional blank-line separator; fall back to a single
        # newline so a malformed message still yields a usable subject.
        self.message_parts = (
            self.message.split("\n\n", maxsplit=1)
            if "\n\n" in self.message
            else self.message.split("\n", maxsplit=1)
        )
        self.subject = self.message_parts[0]
        self.body = self.message_parts[1] if len(self.message_parts) > 1 else None
        self.commit_type = (
            self.subject.split("(")[0] if "(" in self.subject
            else self.subject.split(":")[0] if ":" in self.subject
            else None
        )
        self.valid_commit_types = commit_types.copy()
        self.example_commits = example_commits.copy()
        self.semantic_patterns = semantic_patterns.copy()
        # BUG FIX: the original never set this attribute, so the TF-IDF
        # fallback in _suggest_commit_type raised AttributeError at runtime.
        self.commit_training_data = (
            training_data if training_data is not None else commit_training_data
        ).copy()
        self.issues: list[CommitIssue] = []

        self.vectorizer = TfidfVectorizer()

    def _check_subject(self) -> None:
        """Checks subject-line case, trailing punctuation, and length (<= 50)."""
        first_word = self.subject.split(":")[1].strip() if ":" in self.subject else None
        # BUG FIX: the original tested `not first_word[0].isalpha` (a bound
        # method object, always truthy -> `not` always False), so the
        # non-alphabetic branch of this check could never fire.
        if first_word and (
            first_word[0] != first_word[0].lower() or not first_word[0].isalpha()
        ):
            self.issues.append(
                CommitIssue(
                    severity="high",
                    message="Subject not in lowercase",
                    suggestion="Subject must start with a lowercase letter, unless it's a proper noun or acronym."
                )
            )

        if self.subject.endswith(tuple(string.punctuation)):
            self.issues.append(
                CommitIssue(
                    severity="high",
                    message="Subject ends with a punctuation",
                    suggestion="Remove the punctuation at the end of the subject line."
                )
            )

        if len(self.subject) > 50:
            self.issues.append(
                CommitIssue(
                    severity="high",
                    message="Subject exceeds 50 characters",
                    suggestion="Subject too long, keep it under 50 characters."
                )
            )

    def _check_body(self) -> None:
        """Checks body presence, the blank separator line, and 72-char wrapping."""
        if not self.body:
            self.issues.append(
                CommitIssue(
                    severity="low",
                    message="Commit message may be missing detailed context",
                    suggestion="Consider splitting your commit message into a concise subject and a detailed body."
                )
            )
            return

        if "\n\n" not in self.message:
            self.issues.append(
                CommitIssue(
                    severity="medium",
                    message="Body missing a blank line after subject",
                    suggestion="Add a blank line between the subject and body."
                )
            )

        # Report over-long lines once instead of once per offending line,
        # so a long body does not flood the report with duplicate issues.
        if any(len(line) > 72 for line in self.body.split("\n")):
            self.issues.append(
                CommitIssue(
                    severity="high",
                    message="Body lines exceed 72 characters",
                    suggestion="Body lines too long, wrap text at 72 characters per line."
                )
            )

    def _check_commit_type(self) -> None:
        """Validates the commit-type token and suggests a type when it is invalid."""
        if not self.commit_type:
            self.issues.append(
                CommitIssue(
                    severity="high",
                    message="Commit message type unidentifiable",
                    suggestion="Separate the commit type from the rest of the subject using ':'."
                )
            )
            return

        if self.commit_type != self.commit_type.lower():
            self.issues.append(
                CommitIssue(
                    severity="high",
                    message="Invalid commit type case",
                    suggestion="Commit type must be lowercase (e.g., fix, feat, docs)."
                )
            )

        if self.commit_type.lower() not in self.valid_commit_types:
            likely_type = self._suggest_commit_type()
            # Custom commit-type overrides may not ship an example message;
            # guard the lookup instead of raising KeyError mid-analysis.
            example = self.example_commits.get(likely_type)
            suggestion = f"Use '{likely_type}' for this kind of change"
            if example:
                suggestion += f"\n└─ Example:\n• ```{example}```"
            self.issues.append(
                CommitIssue(
                    severity="high",
                    message="Invalid commit type",
                    suggestion=suggestion,
                )
            )

    def _suggest_commit_type(self) -> str:
        """
        Suggests the most appropriate commit type using a three-stage pipeline:
        keyword indicators first, then TF-IDF similarity against the training
        examples, then semantic word patterns, finally falling back to "chore".
        """
        message = self.message.lower()

        type_scores = {
            commit_type: sum(word in message for word in indicators)
            for commit_type, indicators in self.valid_commit_types.items()
        }

        if any(score > 0 for score in type_scores.values()):
            return max(type_scores.items(), key=lambda x: x[1])[0]

        self._prepare_ml_classifier()
        message_vectorized = self.vectorizer.transform([message])
        similarities = cosine_similarity(message_vectorized, self.x_train_vectorized)[0]

        if max(similarities) > 0.3:  # If we have a decent similarity match
            most_similar_idx = similarities.argmax()
            return self.y_train[most_similar_idx]

        semantic_scores = {
            commit_type: sum(
                1 for pattern in patterns if any(word in message for word in pattern)
            )
            for commit_type, patterns in self.semantic_patterns.items()
        }

        if any(score > 0 for score in semantic_scores.values()):
            return max(semantic_scores.items(), key=lambda x: x[1])[0]

        return "chore"  # Fallback value

    def _prepare_ml_classifier(self) -> None:
        """
        Prepares the machine learning classifier for commit type prediction.
        Transforms the training dataset into TF-IDF vectors and keeps the
        parallel label list for similarity-based nearest-neighbour lookup.
        """
        x_train: list[str] = []
        y_train: list[str] = []

        for commit_type, messages in self.commit_training_data.items():
            x_train.extend(messages)
            y_train.extend([commit_type] * len(messages))

        self.vectorizer.fit(x_train)
        self.x_train_vectorized = self.vectorizer.transform(x_train)
        self.y_train = y_train

    def check_all(self) -> list[CommitIssue]:
        """Runs every format check and returns the accumulated issues."""
        self._check_subject()
        self._check_body()
        self._check_commit_type()
        return self.issues
import string
from collections import Counter
from ..models import CommitIssue
from ...config.data import LETTER_FREQUENCY, VALID_PAIRS


class QualityAnalyzer:
    """
    Assesses the quality of the commit message content.
    Checks the commit message for potential gibberish (keyboard-mash) words.
    """

    # Vowels (including 'y') used by the ratio and consonant-run heuristics.
    _VOWELS = frozenset("aeiouyAEIOUY")

    def __init__(self, message: str) -> None:
        self.words = message.split()
        self.issues: list[CommitIssue] = []

    def _run_gibberish_check(self) -> None:
        """Collects all suspected gibberish words into one high-severity issue."""
        gibberish_words = [
            word.strip(string.punctuation)
            for word in self.words
            if self._check_gibberish(word) and word.strip(string.punctuation)
        ]
        if gibberish_words:
            self.issues.append(
                CommitIssue(
                    severity="high",
                    message="Potential gibberish words detected in commit message",
                    suggestion=f"Review and correct the following words: {', '.join(gibberish_words)}"
                )
            )

    def _check_gibberish(self, word: str) -> bool:
        """
        Determines if a word is likely to be gibberish using multiple linguistic patterns.

        Four independent heuristics are applied:
        1. Vowel ratio: words must keep a vowel-to-length ratio of at least 0.2
        2. Consonant sequences: more than 4 consecutive consonants is suspicious
        3. Letter frequency: for words >= 4 chars, the letter distribution is
           compared against English language norms
        4. Consonant pairs: consonant bigrams that rarely occur in English

        A word is considered gibberish if it fails two or more of these checks.
        """
        word = word.lower().strip(string.punctuation)
        # Very short or non-alphabetic tokens (numbers, identifiers with
        # punctuation) are never classified as gibberish.
        if not word or len(word) < 2 or not word.isalpha():
            return False

        failed_checks = 0
        if self._fails_vowel_ratio(word):
            failed_checks += 1
        if self._has_long_consonant_run(word):
            failed_checks += 1
        if len(word) >= 4 and self._deviates_from_letter_frequency(word):
            failed_checks += 1
        if self._has_invalid_consonant_pairs(word):
            failed_checks += 1

        return failed_checks >= 2

    def _fails_vowel_ratio(self, word: str) -> bool:
        """True when fewer than 20% of the word's letters are vowels."""
        vowel_count = sum(1 for c in word if c in self._VOWELS)
        return vowel_count / len(word) < 0.2

    def _has_long_consonant_run(self, word: str) -> bool:
        """True when the word contains more than 4 consecutive consonants."""
        run = 0
        for char in word:
            if char in self._VOWELS:
                run = 0
            else:
                run += 1
                if run > 4:
                    return True
        return False

    def _deviates_from_letter_frequency(self, word: str) -> bool:
        """
        True when the word's letter distribution deviates strongly from
        English letter-frequency norms (mean absolute deviation > 0.5).
        """
        char_counts = Counter(word)
        deviation = 0.0
        for char, count in char_counts.items():
            if char in LETTER_FREQUENCY:
                expected = LETTER_FREQUENCY[char] / 100  # table values are percentages
                actual = count / len(word)
                deviation += abs(expected - actual)
        return (deviation / len(char_counts)) > 0.5

    def _has_invalid_consonant_pairs(self, word: str) -> bool:
        """True when the word contains more than one rare consonant bigram."""
        invalid_pairs = 0
        for i in range(len(word) - 1):
            pair = word[i:i + 2]
            if pair not in VALID_PAIRS and pair[0] not in self._VOWELS and pair[1] not in self._VOWELS:
                invalid_pairs += 1
                if invalid_pairs > 1:
                    return True
        return False

    def check_all(self) -> list[CommitIssue]:
        """Runs the gibberish scan and returns any issues found."""
        self._run_gibberish_check()
        return self.issues
analyzer.analyze_commit(commit["message"]) if violations: output_message = {"text": analyzer.format_analysis(commit, violations)} - if is_test == "true": - return JSONResponse( - content=output_message["text"], - status_code=status.HTTP_200_OK, - ) - async with httpx.AsyncClient() as client: - await client.post(slack_url, json=output_message) + all_messages.append(output_message["text"]) + else: + async with httpx.AsyncClient() as client: + await client.post(slack_url, json={"text": output_message}) + + if is_test == "true": + return JSONResponse( + content=all_messages, + status_code=status.HTTP_200_OK, + ) except Exception as e: raise HTTPException( status_code=status.HTTP_400_BAD_REQUEST, @@ -63,4 +69,4 @@ async def get_integration_config() -> dict: detail=f"Error retrieving config data: {str(e)}", ) - return json_data + return json_data \ No newline at end of file diff --git a/tests/test_telex.py b/tests/test_telex.py index b0b4b20..98cc7b3 100644 --- a/tests/test_telex.py +++ b/tests/test_telex.py @@ -6,7 +6,7 @@ def test_send_from_telex_success(): response = client.post( "/webhook/telex?is_test=true", json={ - "message": '[{"id": "8ce4cf04f4rw6w8600675237350b14b4", "message": "fix(auth): child\n\nFixed a race condition in the token refresh logic by implementing proper synchronization mechanisms.", "timestamp": "2025-02-18T10:17:54+01:00", "url": "https://github.com/8", "author": {"name": "test", "email": "test@gmail.com"}}]', + "message": '[{"id": "8ce4cf04f4rw6w8600675237350b14b4", "message": "fix(auth): fix race condition\n\n- Added atomic transaction context in user creation functionality.", "timestamp": "2025-02-18T10:17:54+01:00", "url": "https://github.com/8", "author": {"name": "test", "email": "test@gmail.com"}}]', "settings": [ { "label": "commit_types", @@ -16,11 +16,16 @@ def test_send_from_telex_success(): "default": "{'feat': ['add', 'implement', 'new', 'introduce'], 'fix': ['fix', 'resolve', 'patch', 'address']}", }, { - "label": "Example 
Commits", + "label": "example_commits", "type": "text", "required": True, "description": "Set example commits for each custom commit type to guide new devs. These appear in suggestions when similar commits need fixing. Format: {'type1': 'example message1', 'type2': 'example message 2'}.", "default": "{'feat': 'feat(auth): implement OAuth2 with role-based access\n\nImplemented OAuth2 protocol with role-based control to enhance security and scalability.', 'fix': 'fix(api): resolve data race in concurrent requests\n\nFixed a race condition by adding synchronization mechanisms to prevent concurrent data modifications.'}" + }, + { + "label": "slack_url", + "type": "text", + "default": "https://slack.com" } ], }, @@ -35,7 +40,7 @@ def test_send_from_telex_failure(): response = client.post( "/webhook/telex?is_test=true", json={ - "message": '[{"id": "8ce4cf04f4rw6w8600675237350b14b4", "message": "fix(auth): child\n\nFixed a race condcvghdczhjvjhzcvhjvzhjvhjvczjonization mechanisms.", "timestamp": "2025-02-18T10:17:54+01:00", "url": "https://github.com/8", "author": {"name": "test", "email": "test@gmail.com"}}]', + "message": '[{"id": "8ce4cf04f4rw6w8600675237350b14b4", "message": "fix(auth): child jbcskb\n\nFixed a race condcvghdczhjvjhzcvhjvzhjvhjvczjonization mechanisms ashbcds.", "timestamp": "2025-02-18T10:17:54+01:00", "url": "https://github.com/8", "author": {"name": "test", "email": "test@gmail.com"}},{"id": "8ce4cf04f4rw6w8600675237350182b4", "message": "hello: publish notes here", "timestamp": "2025-03-18T10:17:54+01:00", "url": "https://github.com/8", "author": {"name": "test", "email": "test@gmail.com"}}]', "settings": [ { "label": "slack_url", @@ -63,5 +68,7 @@ def test_send_from_telex_failure(): ) assert response.status_code == 200 response_data = json.loads(response.content.decode()) - for word in ("Potential gibberish words", "too brief"): - assert word in response_data + for word in ("Potential gibberish words", "too long"): + assert word in 
response_data[0] + for word in ("Invalid commit type", "missing detailed context"): + assert word in response_data[1] \ No newline at end of file